In [185]:
# ### Mount Drive

# from google.colab import drive
# drive.mount('/content/drive/')

In [186]:
### Imports

import os, requests, json
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from sklearn import preprocessing
import sys

In [187]:
### Constants

WEIGHTS = {"retweet" : 3, "like" : 0.5 ,"quote" : 4 ,"reply" : 1}

### Requests functions

In [205]:
bearer_token = "AAAAAAAAAAAAAAAAAAAAAEqDUAEAAAAAPylQ7hO%2FoW9BuUjtiG608qAZJPg%3DKHFGBs8PtUg49u2TBLzina1UcfwkGtNuCJHlT55omuPiLUaIyi"

def response_health(r):
  if r.status_code != 200:
    raise Exception(
    "Request returned an error: {} {}".format(
      r.status_code, r.text
    )
  )
    
def bearer_oauth(r):
  r.headers["Authorization"] = f"Bearer {bearer_token}"
  return r

def send_request(url, params=None, print_status=False):
  '''Send Request (url) with optional params. Returns json'''
  # https://2.python-requests.org/en/master/api/#requests.request
  if params == None:
    response = requests.request("GET", url, auth=bearer_oauth)
  else:
    response = requests.request("GET", url, auth=bearer_oauth, params=params)
  if print_status: print("Request response status: ", response.status_code)
  response_health(response)
  return response.json()

### Twitter api functions

In [249]:

def get_fast_food_handles():
    df = pd.read_csv('fast-food-chains - Sheet1.csv')
    df.drop(df.columns[0], axis=1, inplace=True)
    df.loc[df.shape[0]] = df.columns
    df.columns = ["handle"]
    twitter_handles = df[df.columns[0]].tolist()
    return twitter_handles

def get_user_data(name):
  # data dictionary scroll down to response fields https://developer.twitter.com/en/docs/twitter-api/users/lookup/api-reference/get-users-by-username-username

  userFields = {"user.fields":"created_at, description, entities, id, location, name, pinned_tweet_id, profile_image_url, protected, public_metrics, url, username, verified, withheld".replace(" ", "")}
  user_json = send_request(f"https://api.twitter.com/2/users/by/username/{name}",params=userFields)
  user_json = user_json["data"]

  outputDict = {}
  outputDict['following_count'] = user_json['public_metrics']['following_count']
  outputDict['tweet_count'] = user_json['public_metrics']['tweet_count']
  outputDict['followers_count'] = user_json['public_metrics']['followers_count']
  outputDict['listed_count'] = user_json['public_metrics']['listed_count']
  outputDict['handle'] = user_json['username']
  outputDict['name'] = user_json['name']
  outputDict['id'] = user_json['id']
  outputDict['verified'] = user_json['verified']
  outputDict['protected'] = user_json['protected']
  outputDict['created_at'] = user_json['created_at']
  outputDict['description'] = user_json['description']

  try:
    test = user_json['pinned_tweet_id']
    outputDict['hasPinnedTweet'] = True
  except:
    outputDict['hasPinnedTweet'] = False
    pass
  try:
    outputDict['urlsInDescription'] = len(user_json['entities']['description']['urls'])
  except:
    outputDict['urlsInDescription'] = 0
    pass
  try:
    outputDict['hashtagsInDescription'] = len(user_json['entities']['description']['hashtags'])
  except:
    outputDict['hashtagsInDescription'] = 0
    pass

  try:
    outputDict['userWebsitesAdded'] = len(user_json['entities']['url']['urls'])
  except:
    outputDict['userWebsitesAdded'] = 0
    pass

  try:
    outputDict['cashtagsInDescription'] = len(user_json['entities']['description']['cashtags'])
  except:
    outputDict['cashtagsInDescription'] = 0
    pass

  try:
    outputDict['mentionsInDescription'] = len(user_json['entities']['description']['mentions'])
  except:
    outputDict['mentionsInDescription'] = 0
    pass

  
  return outputDict

def get_tweets_user(id, numTweets = 10, tweetsPerPage = 10, replies = False, weights=WEIGHTS, paginationToken = None):
    import math
    
    # (there are 10 results returned per page by default)
    if numTweets < tweetsPerPage:
        print("numTweets must be greater than or equal to the number of tweets per page.")
        return

    # to see data dictionary, click url and scroll down to response fields https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets
    expansions = {"expansions":"author_id, attachments.poll_ids, attachments.media_keys, entities.mentions.username, geo.place_id, in_reply_to_user_id, referenced_tweets.id,referenced_tweets.id.author_id".replace(" ", "")}
    tweetFields = {"tweet.fields":"attachments, author_id, context_annotations, conversation_id, created_at, entities, geo, id, in_reply_to_user_id, lang, public_metrics, possibly_sensitive, referenced_tweets, reply_settings, source, text, withheld".replace(" ", "")}
    userFields = {"user.fields":"public_metrics,username"}
    replyFields = {"exclude": "replies"}

    outputDict = {'id':[],'handle':[],'followers':[], 'text':[], 'lang':[],'possibly_sensitive':[],'retweet_count':[],'reply_count':[],'like_count':[],'quote_count':[]
        ,'reply_settings':[],'source':[],'created_at':[],'is_retweet':[],'contains_quote':[],'is_reply':[],'num_referenced_tweets':[],
        'url_image':[],'num_hashtags':[],'text_first_hashtag':[],'num_mentions':[],'num_cashtags':[],'num_polls':[]}

    if not replies:
        outputDict['interaction_score'] = []

    token = ""
    
    if paginationToken != None: 
        token = paginationToken
        
    # for each page of results
    for i in range(math.ceil(numTweets/tweetsPerPage)): 
        if i != 0 or paginationToken != None:
            if replies:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields,**userFields , **{"pagination_token":token}}
            else:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields,**userFields , **{"pagination_token":token}, **replyFields}
        else:
            if replies:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields, **userFields}
            else:
                params = {**{'max_results':tweetsPerPage}, **expansions,**tweetFields, **userFields, **replyFields}


        tweet_json = (send_request(f"https://api.twitter.com/2/users/{id}/tweets", params=params))

        # handle cases where there is no next page of tweets to grab
        try:
            token = tweet_json['meta']['next_token']
        except: 
            token = None # in this case, there is no page to grab next.
            pass
        if token == None:
            break

        username = tweet_json['includes']['users'][0]['username']
        followers = tweet_json['includes']['users'][0]['public_metrics']['followers_count']
        
        tweetData = tweet_json['data']
        for tweet in tweetData:
            outputDict['handle'].append(username)
            outputDict['followers'].append(followers)
            outputDict['id'].append(tweet['id'])
            outputDict['text'].append(tweet['text'])
            outputDict['lang'].append(tweet['lang'])
            outputDict['possibly_sensitive'].append(tweet['possibly_sensitive'])

            outputDict['retweet_count'].append(tweet['public_metrics']['retweet_count'])
            outputDict['reply_count'].append(tweet['public_metrics']['reply_count'])
            outputDict['like_count'].append(tweet['public_metrics']['like_count'])
            outputDict['quote_count'].append(tweet['public_metrics']['quote_count'])
            outputDict['reply_settings'].append(tweet['reply_settings'])
            outputDict['source'].append(tweet['source'])
            outputDict['created_at'].append(tweet['created_at'])

            if not replies:
                outputDict['interaction_score'].append((weights['retweet'] *outputDict['retweet_count'][-1] + weights['like'] *outputDict['like_count'][-1]+
                                                        weights['reply'] *outputDict['reply_count'][-1] + weights['quote'] *outputDict['quote_count'][-1])/followers)

            # referenced tweets: quotes, replies, and retweets
            try:
                refdTweets = tweet['referenced_tweets']
                outputDict['num_referenced_tweets'].append(len(refdTweets))

                rtweet = False
                reply = False
                quote = False
                # there may be multiple referenced tweets, apparently. So it could be a reply and contain a quote, I guess
                for t in refdTweets:
                    typ = t['type']
                    if typ == 'retweeted':
                        outputDict['is_retweet'].append(True)
                        rtweet = True
                    elif typ == 'quoted':
                        outputDict['contains_quote'].append(True)
                        quote = True
                    elif typ == 'replied_to':
                        outputDict['is_reply'].append(True)
                        reply = True
                        
                if not rtweet:
                        outputDict['is_retweet'].append(False)
                if not reply:
                        outputDict['is_reply'].append(False)
                if not quote:
                        outputDict['contains_quote'].append(False)
            except:
                outputDict['num_referenced_tweets'].append(0)
                outputDict['is_retweet'].append(False)
                outputDict['contains_quote'].append(False)
                outputDict['is_reply'].append(False)


            # image
            try:
                outputDict['url_image'].append(tweet['entities']['urls'][0]['images'][0]['url'])  #just grabbing the first image in the first url
            except:
                outputDict['url_image'].append("")

            # hashtags
            try:
                outputDict['num_hashtags'].append(len(tweet['entities']['hashtags']))
                # grabbing just the first hashtag
                outputDict['text_first_hashtag'].append(tweet['entities']['hashtags'][0]['tag'])
            except:
                outputDict['num_hashtags'].append(0) 
                outputDict['text_first_hashtag'].append("")

            # mentions
            try: outputDict['num_mentions'].append(len(tweet['entities']['mentions']))
            except: outputDict['num_mentions'].append(0)

            # cashtags
            try: outputDict['num_cashtags'].append(len(tweet['entities']['cashtags']))
            except: outputDict['num_cashtags'].append(0)
            
            # polls
            try:
                outputDict['num_polls'].append(len(tweet['attachments']['poll_ids']))
            except:
                outputDict['num_polls'].append(0) 
                pass
        
    df = pd.DataFrame(outputDict)


    return df, token

def get_api_data(usernames, replies = False, numTweets = 10, tweetsPerPage = 10, print_status = False):
    import time
    import traceback
    usersDict =  {'following_count':[], 'tweet_count':[], 'followers_count':[], 'listed_count':[], 'handle':[], 'name':[], 'id':[], 
    'verified': [], 'protected': [],'created_at': [],'description': [], 'hasPinnedTweet':[], 'urlsInDescription':[], 'hashtagsInDescription':[],
    'userWebsitesAdded':[], 'cashtagsInDescription':[],'mentionsInDescription':[]}

    idDict = {}

    for username in usernames:
        # get data related to user account
        userData = get_user_data(username)
        
        # create id - username mapping dictionary for use in next loop
        idDict[userData['handle']] = userData['id']
        
        # build the user data dataframe
        for k, v in userData.items():
            usersDict[k].append(v)

    df_user = pd.DataFrame(usersDict)
    
    if not replies:
        #if we are excluding replies, we want to keep pulling until we have at least 500 images
        numImages = 0
        tokenDict = {}
        stopList = []
        nextToken = ""
        firstIteration = True
        loopCount = 1
        df_tweets = None
        while (numImages < 500):
            for username in usernames:
                # get data from tweets of the user

                # handle cases where there are no more tweets to pull 
                if set(stopList) == set(usernames):
                    df_tweets.set_index('id', inplace=True)
                    df_user.set_index('handle', inplace=True)
                    df_tweets.sort_values('handle', inplace=True)
                    return df_tweets, df_user
                elif stopList.count(username) > 0:
                    continue

                # if we left off on a page, then jump to the next page of tweet results
                try:
                    nextToken = tokenDict[username]
                    try:
                        df_sub,token = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies, paginationToken=nextToken)
                    except Exception:
                        traceback.print_exc()
                except: 
                    df_sub,token = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies)

                tokenDict[username] = token

                # handle cases where there are no more tweets to pull 
                if token == None:
                    stopList.append(username)

                if not firstIteration:
                    df_tweets = pd.concat([df_tweets, df_sub])
                else:
                    df_tweets = df_sub
                    firstIteration = False
                if print_status: 
                    num_tweets = len(df_sub)
                    print(f"Retrieved {num_tweets} tweets for: {username}")
            numImages = len(df_tweets[df_tweets.url_image != ""])
            if print_status: print(f"{loopCount} iterations through while loop. {numImages} images retrieved.")
            loopCount += 1
    else:
        df_tweets = None
        firstIteration = True
        for username in usernames:
            # get data from tweets of the user
            df_sub = get_tweets_user(idDict[username], numTweets=numTweets, tweetsPerPage=tweetsPerPage, replies=replies)[0]

            if not firstIteration:
                df_tweets = pd.concat([df_tweets, df_sub])
            else:
                df_tweets = df_sub
                firstIteration = False
            if print_status: 
                num_tweets = len(df_sub)
                print(f"Retrieved {num_tweets} tweets for: {username}")
        

    df_tweets.set_index('id', inplace=True)
    df_user.set_index('handle', inplace=True)
    df_tweets.sort_values('handle', inplace=True)

    return df_tweets, df_user

def dataRetrieval(method = 'csv'):
    methods = ['csv', 'api']
    if method not in methods:
        raise ValueError("Invalid method. Expected one of: %s" % methods)

    if method == 'api':
        twitter_handles = get_fast_food_handles()
        reply_tweetDf, userDf = get_api_data(twitter_handles, replies = True, numTweets = 80, tweetsPerPage = 80, print_status=True)
        noreply_tweetDf = get_api_data(twitter_handles, replies = False, numTweets = 300, tweetsPerPage = 100,print_status=True)[0]

        noreply_tweetDf.to_csv('noReplies.csv')
        reply_tweetDf.to_csv('replies.csv')
        userDf.to_csv('userData.csv')

        return userDf, reply_tweetDf, noreply_tweetDf
    elif method == 'csv':
        userDf = pd.read_csv('userData.csv')
        reply_tweetDf = pd.read_csv('replies.csv')
        noreply_tweetDf = pd.read_csv('noReplies.csv')

        return userDf, reply_tweetDf, noreply_tweetDf

In [261]:
userDf, reply_tweetDf, noreply_tweetDf = dataRetrieval(method = 'csv') # use method = api to pull from api and overwrite csv files

### adding features for individual tweets

In [264]:
def addWordCount(df):

    wordCountData = []
    for row, index in df['text'].items():
        text = df.at[row,'text']
        ltext = text.split()
        wordCount = len(ltext)
        wordCountData.append(wordCount)
    
    df['tweet_word_count'] = wordCountData

    return df

reply_tweetDf = addWordCount(reply_tweetDf)



### aggregation features

In [294]:
def aggregationFeatures(df, groupByFeature): 

    numericCols = [groupByFeature]
    categoricalCols = [groupByFeature]
    df2 = df.drop(columns=groupByFeature)

    for col in df2.columns:
        if pd.api.types.is_numeric_dtype(df2[col]):
            numericCols.append(col)
        else:   
            categoricalCols.append(col)
    
    # numeric
    dfNumeric = df[numericCols]
    dfNumeric = dfNumeric.groupby(groupByFeature).mean()
    
    # categorical
    dfCategorical = df[categoricalCols]
    dfCategorical = dfCategorical.groupby(groupByFeature).agg(pd.Series.mode)

    df = dfNumeric.join(dfCategorical)
    return df

reply_tweetDf = aggregationFeatures(reply_tweetDf, 'handle')

In [296]:
reply_tweetDf.columns

Index(['id', 'followers', 'possibly_sensitive', 'retweet_count', 'reply_count',
       'like_count', 'quote_count', 'is_retweet', 'contains_quote', 'is_reply',
       'num_referenced_tweets', 'num_hashtags', 'num_mentions', 'num_cashtags',
       'num_polls', 'tweet_word_count', 'text', 'lang', 'reply_settings',
       'source', 'created_at', 'url_image', 'text_first_hashtag'],
      dtype='object')

In [291]:
df.to_csv('aggregated.csv')

In [309]:
len(noreply_tweetDf[noreply_tweetDf.url_image != None])

99993

In [307]:
noreply_tweetDf.url_image.value_counts(dropna=False)

NaN                                                                                 99484
https://pbs.twimg.com/news_img/1467491365294678035/vnHxFxoJ?format=jpg&name=orig       46
https://pbs.twimg.com/news_img/1469694220541874184/mCPLpaR_?format=jpg&name=orig       38
https://pbs.twimg.com/news_img/1465356646725693444/tle7cdhM?format=jpg&name=orig       13
https://pbs.twimg.com/news_img/1471250752526077961/Vr5pfjGs?format=jpg&name=orig       12
                                                                                    ...  
https://pbs.twimg.com/news_img/1460350739260133376/8Jebso6v?format=png&name=orig        1
https://pbs.twimg.com/news_img/1443731111587336193/cxHASGB1?format=jpg&name=orig        1
https://pbs.twimg.com/news_img/1448507807871815683/zIg9G_Y2?format=jpg&name=orig        1
https://pbs.twimg.com/news_img/1426326740054466564/aFdeO3O_?format=jpg&name=orig        1
https://pbs.twimg.com/news_img/1458494914342359044/lBk25tAu?format=jpg&name=orig        1
Name: url_

In [251]:
noreply_tweetDf.duplicated().sum()

0

In [252]:
noreply_tweetDf.shape[0]

99993

In [246]:
reply_tweetDf.columns

Index(['handle', 'followers', 'text', 'lang', 'possibly_sensitive',
       'retweet_count', 'reply_count', 'like_count', 'quote_count',
       'reply_settings', 'source', 'created_at', 'is_retweet',
       'contains_quote', 'is_reply', 'num_referenced_tweets', 'url_image',
       'num_hashtags', 'text_first_hashtag', 'num_mentions', 'num_cashtags',
       'num_polls'],
      dtype='object')

In [209]:
noreply_tweetDf

Unnamed: 0_level_0,handle,followers,text,lang,possibly_sensitive,retweet_count,reply_count,like_count,quote_count,reply_settings,...,contains_quote,is_reply,num_referenced_tweets,url_image,num_hashtags,text_first_hashtag,num_mentions,num_cashtags,num_polls,interaction_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1221160812389203969,Applebees,599611,Hurry in before the blue moon is gone. Not tha...,en,False,7,11,49,1,everyone,...,False,False,0,,0,,0,0,0,0.000101
1106316070821003265,Applebees,599611,cool me neither,en,False,23,4,63,3,everyone,...,False,False,0,,0,,0,0,0,0.000194
1106314060310679552,Applebees,599611,have you ever thought about how reading out lo...,en,False,7,4,50,2,everyone,...,False,False,0,,0,,0,0,0,0.000097
1106311542570713089,Applebees,599611,so did St. Patrick like...invent beer?,en,False,20,3,79,4,everyone,...,False,False,0,,0,,0,0,0,0.000198
1106309275612311552,Applebees,599611,"Once I found a three-leaf clover, and then I g...",en,False,7,3,47,3,everyone,...,False,False,0,,0,,0,0,0,0.000099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1228406806805909504,wingstop,263466,"RT @wingstop: @Doordash You have our heart, mi...",en,False,4,0,0,0,everyone,...,False,False,1,,1,BeOurBrandValentine,2,0,0,0.000046
1228405530563092488,wingstop,263466,#Valentinesday isn’t complete without Wings in...,en,False,19,6,91,3,everyone,...,True,False,1,,2,Valentinesday,0,0,0,0.000457
1228399425615208448,wingstop,263466,"RT @wingstop: Today we’re shooting our shot, s...",en,False,15,0,0,0,everyone,...,False,False,1,,1,BeOurBrandValentine,1,0,0,0.000171
1232792065056047110,wingstop,263466,Our bags hold a lot of stuff if you need some ...,en,False,17,6,139,3,everyone,...,True,False,1,,0,,0,0,0,0.000526


In [211]:
for name, count in noreply_tweetDf.handle.value_counts().items():
    print(f"{name} {count}")

qdoba 800
redlobster 800
bjsrestaurants 800
Zippys 800
culvers 800
dominos 800
Hooters 800
DennysDiner 800
YardHouse 800
kogibbq 800
krispykreme 800
TimHortons 800
Chilis 800
LongHornSteaks 800
Cheesecake 800
raisingcanes 800
FirehouseSubs 800
TGIFridays 800
Schlotzskys 800
PFChangs 800
Moes_HQ 800
umamiburger 800
BaskinRobbins 800
SUBWAY 800
SaltgrassSteak 800
McAlistersDeli 800
BurgerKing 800
shakeshack 800
rubytuesday 800
ShakeysUSA 800
SteaknShake 800
FiveGuys 799
WhiteCastle 799
Wendys 799
JohnnyRockets 799
Applebees 799
Dickeys 799
CaptainDs 799
ChickfilA 799
BWWings 799
BlazePizza 799
tacobell 799
Bojangles 799
CapitalGrille 799
calpizzakitchen 799
CarlsJr 799
Popeyes 799
pollotropical 799
habitburger 799
PapaJohns 799
olivegarden 798
SmoothieKing 798
noodlescompany 798
sonicdrivein 798
eatatjacks 798
cheddarskitchen 798
RoundTablePizza 798
Quiznos 798
wienerschnitzel 798
bostonmarket 798
wingstop 798
DelTaco 798
PandaExpress 798
JetsPizza 798
IHOP 798
MODPizza 798
ElPolloLoco 7

In [212]:
len(noreply_tweetDf[noreply_tweetDf.url_image != ""])

509

In [213]:
noreply_tweetDf.columns

Index(['handle', 'followers', 'text', 'lang', 'possibly_sensitive',
       'retweet_count', 'reply_count', 'like_count', 'quote_count',
       'reply_settings', 'source', 'created_at', 'is_retweet',
       'contains_quote', 'is_reply', 'num_referenced_tweets', 'url_image',
       'num_hashtags', 'text_first_hashtag', 'num_mentions', 'num_cashtags',
       'num_polls', 'interaction_score'],
      dtype='object')

In [235]:
fun = noreply_tweetDf.copy()

# fun  = fun.drop(columns=['text', 'lang',  'reply_settings','source','text_first_hashtag', 'url_image', 'created_at'])
fun2  = fun.drop(columns=['followers', 'text', 'lang', 'possibly_sensitive',
       'retweet_count', 'reply_count', 'like_count', 'quote_count',
       'reply_settings', 'source', 'created_at', 'is_retweet',
       'contains_quote', 'is_reply', 'num_referenced_tweets', 'url_image',
       'num_hashtags', 'text_first_hashtag', 'num_mentions', 'num_cashtags',
       'num_polls'])
fun2.columns
fun2 = fun2.groupby('handle').mean()
fun2.idxmax()

interaction_score    PandaExpress
dtype: object