In [1]:
# Import packages needed to gather, clean, 
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import glob
from config import *

## Gathering Data
* Import the weratedogs csv file into a dataframe

* Download udacity image predictor file from the the following URL: https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv

* Create a json file containing each tweet's retweet count and favorite ("like") count at minimum, and any additional data you find interesting. Using the tweet IDs in the WeRateDogs Twitter archive, query the Twitter API for each tweet's JSON data using Python's Tweepy library and store each tweet's entire set of JSON data in a file called tweet_json.txt file. Each tweet's JSON data should be written to its own line. Then read this .txt file line by line into a pandas DataFrame with (at minimum) tweet ID, retweet count, and favorite count.

In [2]:
df_ta = pd.read_csv('twitter-archive-enhanced.csv')
df_ta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [3]:
# Create a request to pull the image predictions tsv file from the udacity site. Take the image predictions file thats stored in
# check response status to ensure we don't have an error.
# the response output and open a tsv file to hold the data so it can be called later.
r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')
r.raise_for_status()

with open('image_predictions.tsv', 'wb') as handle:
    for block in r.iter_content(1024):
        handle.write(block)

In [4]:
# Create dataframe for the image predictions
df_pred = pd.read_csv('image_predictions.tsv', sep='\t')

In [6]:
# Pull information on tweets using twitter api
# Pause loop to allow refresh of data pulling
# print twitter tweet json data to file

auth = tweepy.OAuthHandler(consumer_token, consumer_token_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
i = 0
df_error = []
with open('tweet_json.txt', 'w') as append:
    for index, row in df_ta.iterrows():
            try:
                tweet = api.get_status(row['tweet_id'], )
                json.dump(tweet._json, append)
                append.write('\n')
            except tweepy.TweepError as e:
                df_error.append({'tweet_id': row['tweet_id'], 'Error': (e.args[0][0]['code'], e.args[0][0]['message'])})
                i = i+1
                pass
            
df_error = pd.DataFrame(df_error, columns = ['tweet_id', 'Error'])

Rate limit reached. Sleeping for: 714


TypeError: string indices must be integers

In [7]:
# Loop through tweet_json file and extract the tweeet_id, retweet count, likes
df_tweet_likes = []
tweet_file = open('tweet_json.txt')
for line in tweet_file:
    text = line.split()
    data = json.loads(''.join(text))
    tweet_id = data['id']
    retweet_count = data['retweet_count']
    favorite_count = data['favorite_count']
    df_tweet_likes.append({'tweet_id' : tweet_id
                          ,'favorite_count' : favorite_count
                          ,'retweet_count' : retweet_count})

df_tweet_likes = pd.DataFrame(df_tweet_likes, columns = ['tweet_id', 'favorite_count', 'retweet_count'])

## Assess Data

#### Quality
Review each table and find the quality issues
i.e. Data formats, typos, data inconsistencies (abbr vs. full length), missing information, missing records

#### Tidiness
Breaking apart data in columns (email/phone), dog classification(pupper, doggo, ect), table structure

In [8]:
df_ta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [9]:
df_tweet_likes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1720 entries, 0 to 1719
Data columns (total 3 columns):
tweet_id          1720 non-null int64
favorite_count    1720 non-null int64
retweet_count     1720 non-null int64
dtypes: int64(3)
memory usage: 40.4 KB


In [12]:
df_error.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 2 columns):
tweet_id    14 non-null int64
Error       14 non-null object
dtypes: int64(1), object(1)
memory usage: 304.0+ bytes


#### Quality Issues
    * 