In [1]:
# library imports
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import re
import matplotlib.pyplot as plt

In [2]:
# Load the twitter-archive-enhanced CSV from the local Data/ folder into
# archive_df; this is the first of the three gathered dataframes.
archive_df = pd.read_csv('Data/twitter-archive-enhanced.csv')

In [3]:
# Download the image-predictions TSV, store it under Data/, and read it into
# prediction_df.
dl_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# The local file name is the last path segment of the URL.
filename = dl_url.split('/')[-1]

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(dl_url, timeout=30)
# Stop on HTTP errors (4xx/5xx) instead of saving an error page as the TSV.
response.raise_for_status()

# Binary mode: the response body is written to disk exactly as received.
with open('Data/' + filename, 'wb') as outfile:
    outfile.write(response.content)

prediction_df = pd.read_csv('Data/' + filename, sep='\t')

In [4]:
# Load the Twitter credentials from the local JSON config so that no secret
# is written into the notebook itself.
with open('twitter_config.json') as json_file:
    json_data = json.load(json_file)

# Pull the four credentials out of the config in one pass; a missing key
# raises KeyError.
access_token, access_token_secret, consumer_key, consumer_secret = (
    json_data[field]
    for field in ('access_token', 'access_token_secret',
                  'consumer_key', 'consumer_secret')
)

In [5]:
# Create the tweepy API object.
# The consumer key/secret go to the OAuth handler; the access token/secret
# are then attached to that same handler.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# `api` is the client used by the later cell that fetches each tweet.
api = tweepy.API(auth)

In [7]:
# Query the Twitter API once per archived tweet id and keep each status'
# raw JSON payload in tweet_list.
tweet_ids = list(archive_df.tweet_id)
tweet_list = []
# Ids whose lookup raised, collected so they can be inspected or retried
# later without copying them by hand from the printed output.
failed_ids = []
for tweet_id in tweet_ids:
    try:
        # NOTE(review): the wait_on_rate_limit flags are presumably honoured
        # per call by the installed tweepy version - confirm if tweepy is upgraded.
        status = api.get_status(tweet_id, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
        tweet_list.append(status._json)
    except Exception as e:
        # Deliberate best effort: report the failure and continue with the
        # remaining ids instead of aborting the whole run.
        print(e, str(tweet_id))
        failed_ids.append(tweet_id)

2356

In [32]:
# Compare the number of tweet ids requested with the number of tweets the
# API actually returned.
len(tweet_ids), len(tweet_list)

(2356, 2342)

In [31]:
# Persist the fetched tweets as line-delimited JSON: one tweet per line in
# Data/tweet_json.txt.
with open('Data/tweet_json.txt', 'w') as file:
    for tweet_json in tweet_list:
        file.write(json.dumps(tweet_json) + '\n')

In [27]:
# what to do with missing ids?

# missing_ids = ['754011816964026368','770743923962707968','771004394259247104','775096608509886464','802247111496568832',
#               '827228250799742977','837012587749474308','842892208864923648','845459076796616705','861769973181624320',
#               '866816280283807744','869988702071779329','873697596434513921','888202515573088257']

# for tweet_id in missing_ids:
#     try:
#         status = api.get_status(tweet_id, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
#         tweet_list.append(status._json)
#     except Exception as e:
#         print(e, str(tweet_id))

In [46]:
# Read Data/tweet_json.txt (one JSON document per line) and keep only the
# fields needed for the tweet dataframe: id, retweet count, favorite count.
list_to_insert = []

with open('Data/tweet_json.txt', 'r') as infile:
    for line in infile:
        # Skip blank lines (e.g. a stray trailing newline) so json.loads
        # does not fail on empty input.
        if not line.strip():
            continue
        tweet = json.loads(line)
        list_to_insert.append({'tweet_id': tweet['id'],
                               'retweets': tweet['retweet_count'],
                               'favorites': tweet['favorite_count']})

In [47]:
# Sanity check: reading the file back should give exactly one record per
# tweet that was fetched from the API.
len(tweet_list) == len(list_to_insert)

True

In [53]:
# Build the tweet dataframe from the parsed records with an explicit
# column order.
tweet_columns = ['tweet_id', 'retweets', 'favorites']
tweet_df = pd.DataFrame(list_to_insert, columns=tweet_columns)

In [89]:
# Summarise tweet_df: row count, column dtypes and non-null counts.
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2342 entries, 0 to 2341
Data columns (total 3 columns):
tweet_id     2342 non-null int64
retweets     2342 non-null int64
favorites    2342 non-null int64
dtypes: int64(3)
memory usage: 55.0 KB


## Assess

We have now gathered 3 dataframes:
- `archive_df` from the archive downloaded from Udacity
- `prediction_df` from url
- `tweet_df` from twitter api response content

In [88]:
# Work on copies so the gathered dataframes stay untouched while cleaning.
archive_df_clean, prediction_df_clean, tweet_df_clean = (
    frame.copy() for frame in (archive_df, prediction_df, tweet_df)
)

In [90]:
# Inner-join the archive copy with the API counts on tweet_id, keeping only
# tweets present in both frames, then summarise the merged result.
archive_tweet_df = archive_df_clean.merge(tweet_df_clean, on='tweet_id', how='inner')
archive_tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2342 entries, 0 to 2341
Data columns (total 19 columns):
tweet_id                      2342 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2342 non-null object
source                        2342 non-null object
text                          2342 non-null object
retweeted_status_id           168 non-null float64
retweeted_status_user_id      168 non-null float64
retweeted_status_timestamp    168 non-null object
expanded_urls                 2283 non-null object
rating_numerator              2342 non-null int64
rating_denominator            2342 non-null int64
name                          2342 non-null object
doggo                         2342 non-null object
floofer                       2342 non-null object
pupper                        2342 non-null object
puppo                         2342 non-null object
retweets                      23

In [94]:
# Check for retweets. Logic: rows with a non-null retweeted_status_id are
# the retweets; count the non-null values per column for those rows.
is_retweet = archive_tweet_df.retweeted_status_id.notnull()
archive_tweet_df[is_retweet].count()

tweet_id                      168
in_reply_to_status_id         0  
in_reply_to_user_id           0  
timestamp                     168
source                        168
text                          168
retweeted_status_id           168
retweeted_status_user_id      168
retweeted_status_timestamp    168
expanded_urls                 167
rating_numerator              168
rating_denominator            168
name                          168
doggo                         168
floofer                       168
pupper                        168
puppo                         168
retweets                      168
favorites                     168
dtype: int64

In [61]:
# Show the archive rows that are replies, i.e. rows with a non-null
# in_reply_to_status_id.
archive_df[archive_df.in_reply_to_status_id.notnull()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
30,886267009285017600,8.862664e+17,2.281182e+09,2017-07-15 16:51:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@NonWhiteHat @MayhewMayhem omg hello tanner yo...,,,,,12,10,,,,,
55,881633300179243008,8.816070e+17,4.738443e+07,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@roushfenway These are good dogs but 17/10 is ...,,,,,17,10,,,,,
64,879674319642796034,8.795538e+17,3.105441e+09,2017-06-27 12:14:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@RealKentMurphy 14/10 confirmed,,,,,14,10,,,,,
113,870726314365509632,8.707262e+17,1.648776e+07,2017-06-02 19:38:25 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@ComplicitOwl @ShopWeRateDogs &gt;10/10 is res...,,,,,10,10,,,,,
148,863427515083354112,8.634256e+17,7.759620e+07,2017-05-13 16:15:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@Jack_Septic_Eye I'd need a few more pics to p...,,,,,12,10,,,,,
149,863079547188785154,6.671522e+17,4.196984e+09,2017-05-12 17:12:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Ladies and gentlemen... I found Pipsy. He may ...,,,,https://twitter.com/dog_rates/status/863079547...,14,10,,,,,
179,857214891891077121,8.571567e+17,1.806710e+08,2017-04-26 12:48:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@Marc_IRL pixelated af 12/10,,,,,12,10,,,,,
184,856526610513747968,8.558181e+17,4.196984e+09,2017-04-24 15:13:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...","THIS IS CHARLIE, MARK. HE DID JUST WANT TO SAY...",,,,https://twitter.com/dog_rates/status/856526610...,14,10,,,,,
186,856288084350160898,8.562860e+17,2.792810e+08,2017-04-23 23:26:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@xianmcguire @Jenna_Marbles Kardashians wouldn...,,,,,14,10,,,,,
188,855862651834028034,8.558616e+17,1.943518e+08,2017-04-22 19:15:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@dhmontgomery We also gave snoop dogg a 420/10...,,,,,420,10,,,,,


In [69]:
# Remove the column-width limit so long values such as the tweet text are
# displayed in full. None is the supported "no limit" value; the older -1
# sentinel is deprecated and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [71]:
# Preview the first 10 rows of the archive dataframe.
archive_df.head(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,,,https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,,,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,,,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,,,"https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek https://t.co/kQ04fDDRmh,,,,https://twitter.com/dog_rates/status/891087950875897856/photo/1,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Meet Jax. He enjoys ice cream so much he gets nervous around it. 13/10 help Jax enjoy more things by clicking below\n\nhttps://t.co/Zr4hWfAs1H https://t.co/tVJBRMnhxl,,,,"https://gofundme.com/ydvmve-surgery-for-jax,https://twitter.com/dog_rates/status/890971913173991426/photo/1",13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",When you watch your owner call another dog a good boy but then they turn back to you and say you're a great boy. 13/10 https://t.co/v0nONBcwxq,,,,"https://twitter.com/dog_rates/status/890729181411237888/photo/1,https://twitter.com/dog_rates/status/890729181411237888/photo/1",13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Zoey. She doesn't want to be one of the scary sharks. Just wants to be a snuggly pettable boatpet. 13/10 #BarkWeek https://t.co/9TwLuAGH0b,,,,https://twitter.com/dog_rates/status/890609185150312448/photo/1,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Cassie. She is a college pup. Studying international doggo communication and stick theory. 14/10 so elegant much sophisticate https://t.co/t1bfwz5S2A,,,,https://twitter.com/dog_rates/status/890240255349198849/photo/1,14,10,Cassie,doggo,,,


In [80]:
# Summarise tweet_df: row count, column dtypes and non-null counts.
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2342 entries, 0 to 2341
Data columns (total 3 columns):
tweet_id     2342 non-null int64
retweets     2342 non-null int64
favorites    2342 non-null int64
dtypes: int64(3)
memory usage: 55.0 KB


In [68]:
# Summarise prediction_df: row count, column dtypes and non-null counts.
prediction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [85]:
# Check for duplicates in prediction_df: count the tweet_id values that
# repeat an earlier row. The cell always displays a number, so a result of 0
# explicitly confirms that every tweet_id is unique.
prediction_df.tweet_id.duplicated().sum()

In [73]:
# Check for float numerators: select tweets whose text contains a decimal
# rating such as "13.5/10".
# The pattern has no capture group because str.contains only tests for a
# match, and pandas emits a UserWarning when the regex contains groups.
archive_df[archive_df.text.str.contains(r"\d+\.\d*/\d+")]

  """Entry point for launching an IPython kernel.


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
45,883482846933004288,,,2017-07-08 00:28:19 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Bella. She hopes her smile made you smile. If not, she is also offering you her favorite monkey. 13.5/10 https://t.co/qjrljjt948",,,,"https://twitter.com/dog_rates/status/883482846933004288/photo/1,https://twitter.com/dog_rates/status/883482846933004288/photo/1",5,10,Bella,,,,
340,832215909146226688,,,2017-02-16 13:11:49 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…",7.867091e+17,4196984000.0,2016-10-13 23:23:56 +0000,https://twitter.com/dog_rates/status/786709082849828864/photo/1,75,10,Logan,,,,
695,786709082849828864,,,2016-10-13 23:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS",,,,https://twitter.com/dog_rates/status/786709082849828864/photo/1,75,10,Logan,,,,
763,778027034220126208,,,2016-09-20 00:24:34 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Sophie. She's a Jubilant Bush Pupper. Super h*ckin rare. Appears at random just to smile at the locals. 11.27/10 would smile back https://t.co/QFaUiIHxHq,,,,https://twitter.com/dog_rates/status/778027034220126208/photo/1,27,10,Sophie,,,pupper,
1689,681340665377193984,6.813394e+17,4196984000.0,2015-12-28 05:07:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",I've been told there's a slight possibility he's checking his mirror. We'll bump to 9.5/10. Still a menace,,,,,5,10,,,,,
1712,680494726643068929,,,2015-12-25 21:06:00 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here we have uncovered an entire battalion of holiday puppers. Average of 11.26/10 https://t.co/eNm2S6p9BD,,,,https://twitter.com/dog_rates/status/680494726643068929/photo/1,26,10,,,,,


tweet_id                      156
in_reply_to_status_id         0  
in_reply_to_user_id           0  
timestamp                     156
source                        156
text                          156
retweeted_status_id           156
retweeted_status_user_id      156
retweeted_status_timestamp    156
expanded_urls                 156
rating_numerator              156
rating_denominator            156
name                          156
doggo                         156
floofer                       156
pupper                        156
puppo                         156
dtype: int64