# Table of contents

[1. Loading Libraries](#loading)

[2. Gathering Data](#gathering)
  * [A. Importing Enhanced Twitter Archive](#T-archive)
   
  * [B. Downloading the The tweet image predictions](#predictions)
   
  * [C. Getting Tweet Data From Twitter API](#T-api)
   
<ul>
<li><a href="#loading">1. Loading Libraries</a></li>
<li><a href="#gathering">2. Gathering Data</a>
    <ul>
        <li><a href="#T-archive">A. Importing Twitter Archive</a></li>
        <li><a href="#predictions">B. Downloading the The tweet image predictions</a></li>
        <li><a href="#T-api">C. Getting Data From Twitter API </a></li>
    </ul>
<li><a href="#assess">3. Assess the Data</a>
    <ul>
        <li><a href="#archive-a">`archive` table</a></li>
        <li><a href="#predictions-a">`predictions` table</a></li>
        <li><a href="#api-a">`api_data` table</a></li>
        <li><a href="#inclusion">Data Inclusion Criteria</a></li>
        <li><a href="#findings">Findings</a></li>
    </ul>
<li><a href="#clean">4. Clean the Data</a></li>
<li><a href="#analysis">5. Analysis & Visualization</a></li>
<li><a href="#conclusions">6. Conclusions</a></li>
</ul>

<a id='loading'></a>
## 1. Loading Libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import requests
import json

<a id='Gathering'></a>
## 2. Gathering Data

<a id='T-archive'></a>

### A. Importing Enhanced Twitter Archive

In [2]:
t_archive = pd.read_csv('twitter-archive-enhanced.csv')
t_archive.head(1685)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,,,,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,,,,https://twitter.com/dog_rates/status/890609185...,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,14,10,Cassie,doggo,,,


<a id='predictions'></a>

### B. Downloading the  Tweet Image Predictions

In [3]:

url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
response = requests.get(url)

with open('image-predictions.tsv', mode ='wb') as file:
    file.write(response.content)

i_predictions = pd.read_csv('image-predictions.tsv', sep='\t' )
i_predictions.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


<a id='T-api'></a>
### C. Getting Tweet Data From Twitter API

In [None]:
# Paste code from twitter_api.py here before submission. DO NOT RUN IT

In [None]:
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change line 17 to match the name of your DataFrame with twitter_archive_enhanced.csv
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = t_archive.tweet_id.values
len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)

In [22]:
df = pd.read_fwf('tweet_json.txt', header=None)


In [23]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59,60,61,62,63,64,65,66,67,68
0,"{""created_at"":","""Tue",Aug,1,16:23:56,0,"2017"",","""id"":",892420643555336193,"""id_str"":",...,,,,,,,,,,
1,"{""created_at"":","""Tue",Aug,1,00:17:27,0,"2017"",","""id"":",892177421306343426,"""id_str"":",...,,,,,,,,,,
2,"{""created_at"":","""Mon",Jul,31,00:18:03,0,"2017"",","""id"":",891815181378084864,"""id_str"":",...,,,,,,,,,,
3,"{""created_at"":","""Sun",Jul,30,15:58:51,0,"2017"",","""id"":",891689557279858688,"""id_str"":",...,,,,,,,,,,
4,"{""created_at"":","""Sat",Jul,29,16:00:24,0,"2017"",","""id"":",891327558926688256,"""id_str"":",...,,,,,,,,,,


In [6]:
df_list = []
with open('tweet-json.txt') as json_file:
    data = json.load(json_file)
    for tweet in data:
        df_list.append({'tweet_id': tweet['id'],
                        'retweet_count': tweet['retweet_count'], 
                        'favorite_count': tweet['favorite_count']})

JSONDecodeError: Extra data: line 2 column 1 (char 3974)

In [None]:
tweet_data = pd.DataFrame(df_list, columns = ['tweet_id', 
                                            'retweet_count', 
                                            'favorite_count'])