In [1]:
# Import packages needed to gather, clean, 
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import glob
from config import *

## Gathering Data
* Import the weratedogs csv file into a dataframe

* Download udacity image predictor file from the the following URL: https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv

* Create a json file containing each tweet's retweet count and favorite ("like") count at minimum, and any additional data you find interesting. Using the tweet IDs in the WeRateDogs Twitter archive, query the Twitter API for each tweet's JSON data using Python's Tweepy library and store each tweet's entire set of JSON data in a file called tweet_json.txt file. Each tweet's JSON data should be written to its own line. Then read this .txt file line by line into a pandas DataFrame with (at minimum) tweet ID, retweet count, and favorite count.

In [2]:
# Pull in the twitter-archive-enhanced csv file downloaded from the udacity site.
df_ta = pd.read_csv('twitter-archive-enhanced.csv')
df_ta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [3]:
# Create a request to pull the image predictions tsv file from the udacity site. Take the image predictions file thats stored in
# check response status to ensure we don't have an error.
# the response output and open a tsv file to hold the data so it can be called later.
r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')
r.raise_for_status()

with open('image_predictions.tsv', 'wb') as handle:
    for block in r.iter_content(1024):
        handle.write(block)

In [4]:
# Create dataframe for the image predictions file just downloaded
df_pred = pd.read_csv('image_predictions.tsv', sep='\t')

In [5]:
# Pull information on tweets using twitter api
# Pause loop to allow refresh of data pulling
# print twitter json data to file (tweet_json.txt)

auth = tweepy.OAuthHandler(consumer_token, consumer_token_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
df_error = []
with open('tweet_json.txt', 'w') as append:
    for index, row in df_ta.iterrows():
            try:
                tweet = api.get_status(row['tweet_id'], )
                json.dump(tweet._json, append)
                append.write('\n')
            except tweepy.TweepError as e:
                df_error.append({'tweet_id': row['tweet_id'], 'Error': (e.args[0][0]['code'], e.args[0][0]['message'])})
                pass
            
df_error = pd.DataFrame(df_error, columns = ['tweet_id', 'Error'])
df_error.to_csv('df_error.csv')

Rate limit reached. Sleeping for: 719
Rate limit reached. Sleeping for: 716


In [6]:
# Loop through tweet_json file and extract the tweeet_id, retweet_count, favorite_count from the tweet_json.txt file created
# in the previous step.
df_tweet_likes = []
tweet_file = open('tweet_json.txt')
for line in tweet_file:
    data = json.loads(line)
    tweet_id = data['id']
    retweet_count = data['retweet_count']
    favorite_count = data['favorite_count']
    df_tweet_likes.append({'tweet_id' : tweet_id
                          ,'favorite_count' : favorite_count
                          ,'retweet_count' : retweet_count})

df_tweet_likes = pd.DataFrame(df_tweet_likes, columns = ['tweet_id', 'favorite_count', 'retweet_count'])

In [7]:
# See how many likes you pulled. Compare to df_ta file.
print('There are {} records in the df_tweetd_likes, compared to {} records in the df_ta.'.format(len(df_tweet_likes), len(df_ta)))

There are 2342 records in the df_tweetd_likes, compared to 2356 records in the df_ta.


So we are missing 14 records between the two dataframes. I am sure this will change during the clean up step when I remove some of the retweets.

## Assess Data

In this section we will assess the data for quality and tidiness issues as outlined below.

#### Quality
Review each table and find the quality issues
i.e. Data formats, typos, data inconsistencies (abbr vs. full length), missing information, missing records

#### Tidiness
Breaking apart data in columns (email/phone), dog classification(pupper, doggo, ect), table structure

In [8]:
# Pull the info on the df_ta file and determine what kind of data types we are dealing with.
df_ta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

From this we can tell that we have a few data types that need to be cleaned up. Such as the timestamp fields

In [9]:
# Pull the info on the df_pred dataframe and determine what kind of data types we are dealing with
df_pred.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [10]:
df_tweet_likes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2342 entries, 0 to 2341
Data columns (total 3 columns):
tweet_id          2342 non-null int64
favorite_count    2342 non-null int64
retweet_count     2342 non-null int64
dtypes: int64(3)
memory usage: 55.0 KB


In [11]:
df_error.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 2 columns):
tweet_id    14 non-null int64
Error       14 non-null object
dtypes: int64(1), object(1)
memory usage: 304.0+ bytes


In [12]:
# Review the table and see what needs to be cleaned up
df_ta.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [13]:
# Review and locate the number of incorrect names
sorted(df_ta.name.unique())

['Abby',
 'Ace',
 'Acro',
 'Adele',
 'Aiden',
 'Aja',
 'Akumi',
 'Al',
 'Albert',
 'Albus',
 'Aldrick',
 'Alejandro',
 'Alexander',
 'Alexanderson',
 'Alf',
 'Alfie',
 'Alfy',
 'Alice',
 'Amber',
 'Ambrose',
 'Amy',
 'Amélie',
 'Anakin',
 'Andru',
 'Andy',
 'Angel',
 'Anna',
 'Anthony',
 'Antony',
 'Apollo',
 'Aqua',
 'Archie',
 'Arlen',
 'Arlo',
 'Arnie',
 'Arnold',
 'Arya',
 'Ash',
 'Asher',
 'Ashleigh',
 'Aspen',
 'Astrid',
 'Atlas',
 'Atticus',
 'Aubie',
 'Augie',
 'Autumn',
 'Ava',
 'Axel',
 'Bailey',
 'Baloo',
 'Balto',
 'Banditt',
 'Banjo',
 'Barclay',
 'Barney',
 'Baron',
 'Barry',
 'Batdog',
 'Bauer',
 'Baxter',
 'Bayley',
 'BeBe',
 'Bear',
 'Beau',
 'Beckham',
 'Beebop',
 'Beemo',
 'Bell',
 'Bella',
 'Belle',
 'Ben',
 'Benedict',
 'Benji',
 'Benny',
 'Bentley',
 'Berb',
 'Berkeley',
 'Bernie',
 'Bert',
 'Bertson',
 'Betty',
 'Beya',
 'Biden',
 'Bilbo',
 'Billl',
 'Billy',
 'Binky',
 'Birf',
 'Bisquick',
 'Blakely',
 'Blanket',
 'Blipson',
 'Blitz',
 'Bloo',
 'Bloop',
 'Blu',


In [14]:
# Count the number of retweets
len(df_ta.retweeted_status_id.unique())

182

In [15]:
df_ta.rating_numerator.unique()

array([  13,   12,   14,    5,   17,   11,   10,  420,  666,    6,   15,
        182,  960,    0,   75,    7,   84,    9,   24,    8,    1,   27,
          3,    4,  165, 1776,  204,   50,   99,   80,   45,   60,   44,
        143,  121,   20,   26,    2,  144,   88], dtype=int64)

In [16]:
df_ta.rating_denominator.unique()

array([ 10,   0,  15,  70,   7,  11, 150, 170,  20,  50,  90,  80,  40,
       130, 110,  16, 120,   2], dtype=int64)

In [17]:
len(df_ta.in_reply_to_status_id.unique())

78

In [18]:
len(df_ta.in_reply_to_user_id.unique())

32

In [19]:
df_tweet_likes.query('favorite_count == 0')

Unnamed: 0,tweet_id,favorite_count,retweet_count
31,886054160059072513,0,105
35,885311592912609280,0,18237
67,879130579576475649,0,6721
72,878404777348136964,0,1270
73,878316110768087041,0,6558
77,877611172832227328,0,80
90,874434818259525634,0,14572
95,873337748698140672,0,1572
106,871166179821445120,0,5678
120,868639477480148993,0,2103


In [20]:
from pprint import pprint
pprint(api.get_status(886054160059072513)._json)

{'contributors': None,
 'coordinates': None,
 'created_at': 'Sat Jul 15 02:45:48 +0000 2017',
 'entities': {'hashtags': [{'indices': [21, 26], 'text': 'BATP'}],
              'symbols': [],
              'urls': [{'display_url': 'twitter.com/dog_rates/stat…',
                        'expanded_url': 'https://twitter.com/dog_rates/status/886053434075471873',
                        'indices': [27, 50],
                        'url': 'https://t.co/WxwJmvjfxo'}],
              'user_mentions': [{'id': 19607400,
                                 'id_str': '19607400',
                                 'indices': [3, 13],
                                 'name': 'Oakland Athletics 🌳🐘⚾️',
                                 'screen_name': 'Athletics'}]},
 'favorite_count': 0,
 'favorited': False,
 'geo': None,
 'id': 886054160059072513,
 'id_str': '886054160059072513',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,

#### Quality Issues
* timestamp not listed as datetime field `df_ta`
* Multiple names not entered correctly (quite, such, a, not, O, just, an, a, all, actually, an, by, getting, his, incredibly, ect) `df_ta`
* remove retweets (182) `df_ta`
* remove tweets that had errors pulling data from twitter api `df_errors`, most likely due to a tweet or account being removed. (14)
* `df_tweet_likes` (166) without a favorite, yet when you visit the page you see they've been favorited.
* Source contains to much non-sense infomation, should be trimmed down to where the tweet came from. (iPhone, twitter, ect) `df_ta`
* Why so many 0 for favorite_count? Looking at data seems maybe from retweets? `df_tweet_likes`
* Dog name case on the `df_pred` table in columns p1, p2 and p3

#### Tidy Issues

* Multiple tables `df_ta`, `df_errors`, `df_tweet_likes` and `df_pred`
* Multiple columns for dog rating (doggo, floofer, pupper, puppo) `df_ta`