# Wrangle and Analyze Data

## Preliminaries

Load packages and libraries required for subsequent blocks.

In [58]:
# import packages
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import os
from timeit import default_timer as timer

## Gather Data

Gather data need for analyzing We Rate Dogs (@dog_rates) twitter feed. There are three files comprising the broader data set.

### File on-hand: Twitter Archive

This file was presented on and downloaded from the Udacity website. So it is simply read into the dataframe using the Pandas 'read_csv' function.

In [3]:
# read dog_rates twitter archive
df = pd.read_csv('twitter-archive-enhanced.csv')

In [37]:
# printing a sample of the dataframe allows for visual inspection.
df.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
175,857989990357356544,,,2017-04-28 16:08:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Rosie. She was just informed of the wa...,,,,https://twitter.com/dog_rates/status/857989990...,12,10,Rosie,,,,
829,768909767477751808,,,2016-08-25 20:35:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: When it's Janet from accounting...,7.001438e+17,4196984000.0,2016-02-18 02:24:13 +0000,https://twitter.com/dog_rates/status/700143752...,10,10,,,,pupper,
992,748692773788876800,,,2016-07-01 01:40:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",That is Quizno. This is his beach. He does not...,,,,https://twitter.com/dog_rates/status/748692773...,10,10,his,doggo,,,
503,813066809284972545,,,2016-12-25 17:00:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tyr. He is disgusted by holiday traffi...,,,,https://twitter.com/dog_rates/status/813066809...,12,10,Tyr,,,,
2166,669363888236994561,,,2015-11-25 03:56:01 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a Gingivitis Pumpernickel named Z...,,,,https://twitter.com/dog_rates/status/669363888...,10,10,,,,,


In [38]:
# using the pandas info() function shows all the variables, their data-types and gives us 
# a count of the entries and missing values.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

### Download File from Server: Tweet Image Predictions

For the tweet image predictions file, we download the file from a Udacity server using the 'requests'library. This file is then written (saved) to the project directory.

In [19]:
# download tweet image predictions from server - save into workspace
image_prediction = {'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'}
# 'for' loop downloads all files at URL list in ebert_review_urls variable
for url in image_prediction:
    response = requests.get(url)
    with open(os.path.join(url.split('/')[-1]), mode = 'wb') as file:
        file.write(response.content)

We can verify that the download occurred by viewing the project directory to confirm there is an 'image_predictions.csv' file in it

In [20]:
os.listdir()

['wrangle_act.ipynb',
 'twitter-archive-enhanced.csv',
 'README.md',
 '.ipynb_checkpoints',
 'image-predictions.tsv',
 '.git']

We then open the file using the 'read_csv' function with the tab-separator indicated.

In [21]:
df2 = pd.read_csv('image-predictions.tsv',sep='\t')

Next, we perform visual inspection and examine the fields using the info() function, same as with the tweet archive file.

In [22]:
df2.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1804,832215726631055365,https://pbs.twimg.com/media/CwJR1okWIAA6XMp.jpg,1,Afghan_hound,0.274637,True,borzoi,0.142204,True,doormat,0.109677,False
1904,852189679701164033,https://pbs.twimg.com/media/C9OV99SXsAEmj1U.jpg,1,barrow,0.42315,False,Bernese_mountain_dog,0.415374,True,EntleBucher,0.067345,True
847,695409464418041856,https://pbs.twimg.com/media/CaaXN5LUYAEzAh-.jpg,1,pug,0.997445,True,bull_mastiff,0.001749,True,Pekinese,0.000304,True
1228,745422732645535745,https://pbs.twimg.com/media/ClhGBCAWIAAFCsz.jpg,1,Labrador_retriever,0.6638,True,golden_retriever,0.308261,True,ice_bear,0.004269,False
1544,791672322847637504,https://pbs.twimg.com/media/CvyVxQRWEAAdSZS.jpg,1,golden_retriever,0.705092,True,Labrador_retriever,0.219721,True,kuvasz,0.015965,True


In [39]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


### Create File Dynamically Using Tweepy to Query Twitter API: tweet_json.txt File

The third file needed for this project containing information such as 'retweet' and 'favorite' counts is created dynamically by querying the Twitter api.

First step here took place elsewhere. At apps.twitter.com, I created a (dummy) app which enabled me to create authorization keys as required by Twitter.

Second step is to enter code as indicated in the [Tweepy documentation](http://tweepy.readthedocs.io/en/v3.6.0/getting_started.html#introduction) authenticating access to the API.

In [25]:
# Tweepy auth stuff
# place keys here - delete manually for now

In [26]:
# Tweepy auth stuff continued
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

The next step is for Tweepy to access twitter data for the 2356 tweets in the tweet_archive file and to download that data as a json file

*Still figuring this one out* code below is experimental

In [51]:
# Tweepy api call - testing
tweet = api.get_status(776201521193218049)
tweet.text

"This is O'Malley. That is how he sleeps. Doesn't care what you think about it. 10/10 comfy af https://t.co/Pq150LeRaC"

In [43]:
# put all of the tweetids from the twitter_archive csv/dataframe into a list
tweetids = df['tweet_id'].tolist()
# tweetids
len(tweetids)

2356

In [64]:
# List of dictionaries to build and convert to a DataFrame later
tweet_list = []
tweet_errors = []
start = timer()
for tweet in tweetids:
    try:
        full_status = api.get_status(tweet,wait_on_rate_limit=True,\
                                     wait_on_rate_limit_notify=True)
        tweet_list.append(full_status)
    except Exception as e:
        end = timer()
        print('No status found for tweetid ' + str(tweet))
        print('Time elapsed to error ' + str(end - start))
        tweet_errors.append(tweet)
print(len(tweet_list))
print (len(tweet_errors))

No status found for tweetid 888202515573088257
Time elapsed to error 4.159179584006779
No status found for tweetid 873697596434513921
Time elapsed to error 22.734724859998096
No status found for tweetid 869988702071779329
Time elapsed to error 27.523074962984538
No status found for tweetid 866816280283807744
Time elapsed to error 30.470067259011557
No status found for tweetid 861769973181624320
Time elapsed to error 35.37255413699313
No status found for tweetid 845459076796616705
Time elapsed to error 54.79257122299168
No status found for tweetid 842892208864923648
Time elapsed to error 57.50201636500424
No status found for tweetid 837012587749474308
Time elapsed to error 65.64513875899138
No status found for tweetid 827228250799742977
Time elapsed to error 85.8148105769942
No status found for tweetid 802247111496568832
Time elapsed to error 125.79686354199657
No status found for tweetid 775096608509886464
Time elapsed to error 172.05036198999733


Rate limit reached. Sleeping for: 705


No status found for tweetid 705066031337840642
Time elapsed to error 1014.6548392880068


Rate limit reached. Sleeping for: 186


2344
12


In [73]:
tweet_list[:1]

[Status(_api=<tweepy.api.API object at 0x109cb1b38>, _json={'created_at': 'Tue Aug 01 16:23:56 +0000 2017', 'id': 892420643555336193, 'id_str': '892420643555336193', 'text': "This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU", 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 892420639486877696, 'id_str': '892420639486877696', 'indices': [86, 109], 'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg', 'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg', 'url': 'https://t.co/MgUWQ76dJU', 'display_url': 'pic.twitter.com/MgUWQ76dJU', 'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1', 'type': 'photo', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'medium': {'w': 540, 'h': 528, 'resize': 'fit'}, 'small': {'w': 540, 'h': 528, 'resize': 'fit'}, 'large': {'w': 540, 'h': 528, 'resize': 'fit'}}}]}, 'exte

In [80]:
# Store tweet info in JSON format
data = dict(tweet_list)

with open('tweet_json', 'w') as outfile:  
    json.dump(data, outfile)

TypeError: cannot convert dictionary update sequence element #0 to a sequence