## Gather

In [1]:
import pandas as pd
import numpy as np
import requests
import os
import tweepy
import json

In [2]:
# Load the WeRateDogs enhanced Twitter archive from the local CSV into a DataFrame
twitter_archive = pd.read_csv(filepath_or_buffer='twitter-archive-enhanced.csv')

In [3]:
# Gathering the tweet image predictions file (image-predictions.tsv).
# The file is downloaded over HTTP and saved inside `folder_name`, under the
# file name taken from the last path segment of the URL.
folder_name = 'image_predictions'
# exist_ok=True makes the cell safe to re-run without a separate existence check
os.makedirs(folder_name, exist_ok=True)

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the cell from hanging indefinitely on a stalled connection
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as the TSV
response.raise_for_status()

with open(os.path.join(folder_name, url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)

In [4]:
# Gathering additional data: retweet count and favorite count by:

# reading the keys and tokens and creating the API object.
# Each credential is taken from an environment variable when it is set, so real
# secrets never have to be typed into (or saved with) the notebook; the
# placeholder strings are only the fallback when the variable is missing.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'MY_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'MY_CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'MY_ACCESS_TOKEN')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'MY_ACCESS_SECRET')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# With wait_on_rate_limit the client sleeps when the rate limit is reached
# (the "Rate limit reached. Sleeping for: ..." notices come from the notify flag)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [6]:
api_counts = {}

# Query every tweet id of the archive through api.get_status and store its
# retweet and favorite counts in api_counts, keyed by the tweet id.
# Iterating over the column values (instead of looking each id up with
# twitter_archive.tweet_id[i]) keeps the loop correct even when the frame's
# index is not the default 0..n-1 range.
for i, tweet_id in enumerate(twitter_archive.tweet_id):
    # Report progress before the request so a failing tweet cannot swallow the
    # message; at this point exactly i tweets have been handled.
    if i % 100 == 0:
        print(f"{i} tweets processed.")
    try:
        tweet = api.get_status(tweet_id, tweet_mode='extended')
    except tweepy.TweepError as error:
        # TweepError is not limited to deleted tweets, so show the actual
        # reason rather than claiming the tweet does not exist.
        print(f"Tweet with ID '{tweet_id}' could not be retrieved: {error}")
        continue
    # int() yields a plain Python int for the key (the id presumably arrives as
    # a numpy integer from the DataFrame, which json cannot use as a key)
    api_counts[int(tweet_id)] = {
        'retweet_count': tweet._json['retweet_count'],
        'favorite_count': tweet._json['favorite_count']
    }

0 tweets processed.
Tweet with ID '888202515573088257' does not exist.
Tweet with ID '873697596434513921' does not exist.
100 tweets processed.
Tweet with ID '872668790621863937' does not exist.
Tweet with ID '872261713294495745' does not exist.
Tweet with ID '869988702071779329' does not exist.
Tweet with ID '866816280283807744' does not exist.
Tweet with ID '861769973181624320' does not exist.
Tweet with ID '856602993587888130' does not exist.
200 tweets processed.
Tweet with ID '851953902622658560' does not exist.
Tweet with ID '851464819735769094' does not exist.
Tweet with ID '845459076796616705' does not exist.
Tweet with ID '844704788403113984' does not exist.
Tweet with ID '842892208864923648' does not exist.
Tweet with ID '837366284874571778' does not exist.
Tweet with ID '837012587749474308' does not exist.
300 tweets processed.
Tweet with ID '830956169170665475' does not exist.
Tweet with ID '829374341691346946' does not exist.
Tweet with ID '827324948884643840' does not exi

Rate limit reached. Sleeping for: 241


1200 tweets processed.
1300 tweets processed.
Tweet with ID '704871453724954624' does not exist.
1400 tweets processed.
1500 tweets processed.
1600 tweets processed.
1700 tweets processed.
Tweet with ID '680055455951884288' does not exist.
1800 tweets processed.
1900 tweets processed.
2000 tweets processed.


Rate limit reached. Sleeping for: 522


2100 tweets processed.
2200 tweets processed.
2300 tweets processed.


In [7]:
# Persist api_counts to tweet_json.txt: one JSON object of the form
# {tweet_id: counts} per line
with open("tweet_json.txt", "w") as outfile:
    for status_id, status_counts in api_counts.items():
        outfile.write(json.dumps({status_id: status_counts}))
        outfile.write('\n')

In [8]:
# Reading the tweet_json.txt file line by line into an empty dict.
# Every line holds one JSON object of the form {tweet_id: {counts}}; the id is
# moved into the record itself and the records are keyed by their line number.
counts = {}
# The with-block guarantees the file handle is closed once reading is done
with open('tweet_json.txt', 'r') as infile:
    for i, line in enumerate(infile):
        line_data = json.loads(line)
        # each object carries a single key: the tweet id
        key, value = next(iter(line_data.items()))
        # JSON object keys are always strings, so tweet_id is a str here
        value['tweet_id'] = key
        counts[i] = value

In [9]:
# Inspecting the parsed records: line index -> {retweet_count, favorite_count, tweet_id}
counts

{0: {'retweet_count': 7664,
  'favorite_count': 36013,
  'tweet_id': '892420643555336193'},
 1: {'retweet_count': 5664,
  'favorite_count': 31076,
  'tweet_id': '892177421306343426'},
 2: {'retweet_count': 3758,
  'favorite_count': 23382,
  'tweet_id': '891815181378084864'},
 3: {'retweet_count': 7836,
  'favorite_count': 39304,
  'tweet_id': '891689557279858688'},
 4: {'retweet_count': 8437,
  'favorite_count': 37527,
  'tweet_id': '891327558926688256'},
 5: {'retweet_count': 2829,
  'favorite_count': 18908,
  'tweet_id': '891087950875897856'},
 6: {'retweet_count': 1843,
  'favorite_count': 10991,
  'tweet_id': '890971913173991426'},
 7: {'retweet_count': 17113,
  'favorite_count': 60690,
  'tweet_id': '890729181411237888'},
 8: {'retweet_count': 3893,
  'favorite_count': 26024,
  'tweet_id': '890609185150312448'},
 9: {'retweet_count': 6656,
  'favorite_count': 29736,
  'tweet_id': '890240255349198849'},
 10: {'retweet_count': 6643,
  'favorite_count': 28607,
  'tweet_id': '89000660

In [10]:
# Build a DataFrame from the dict of records. Each dict key (the line index)
# becomes a column, so the frame comes out wide: one column per tweet and one
# row per field (retweet_count, favorite_count, tweet_id).
counts_df = pd.DataFrame(data=counts)
counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2315,2316,2317,2318,2319,2320,2321,2322,2323,2324
retweet_count,7664,5664,3758,7836,8437,2829,1843,17113,3893,6656,...,53,124,220,772,51,40,130,41,42,460
favorite_count,36013,31076,23382,39304,37527,18908,10991,60690,26024,29736,...,105,268,407,1121,124,96,271,112,121,2399
tweet_id,892420643555336193,892177421306343426,891815181378084864,891689557279858688,891327558926688256,891087950875897856,890971913173991426,890729181411237888,890609185150312448,890240255349198849,...,666058600524156928,666057090499244032,666055525042405380,666051853826850816,666050758794694657,666049248165822465,666044226329800704,666033412701032449,666029285002620928,666020888022790149


In [11]:
# Transpose so that every tweet is a row and the three fields are columns.
# The frame is rebuilt from `counts` rather than from counts_df itself, so
# re-running this cell does not flip the table back to its wide layout.
counts_df = pd.DataFrame(counts).T
counts_df

Unnamed: 0,retweet_count,favorite_count,tweet_id
0,7664,36013,892420643555336193
1,5664,31076,892177421306343426
2,3758,23382,891815181378084864
3,7836,39304,891689557279858688
4,8437,37527,891327558926688256
...,...,...,...
2320,40,96,666049248165822465
2321,130,271,666044226329800704
2322,41,112,666033412701032449
2323,42,121,666029285002620928
