In [1]:
# library imports
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import re
import matplotlib.pyplot as plt

In [2]:
# Load the enhanced Twitter archive CSV stored under Data/ into a DataFrame.
archive_csv_path = 'Data/twitter-archive-enhanced.csv'
archive_df = pd.read_csv(archive_csv_path)

In [3]:
# Download the image-predictions TSV, save it under Data/, and read it into a DataFrame.
dl_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
filename = dl_url.split('/')[-1]
predictions_path = 'Data/' + filename  # built once, used for both writing and reading

response = requests.get(dl_url)
# Fail loudly on an HTTP error instead of saving an error page and parsing it as the TSV.
response.raise_for_status()

# Binary mode stores the bytes exactly as they were served.
with open(predictions_path, 'wb') as outfile:
    outfile.write(response.content)

prediction_df = pd.read_csv(predictions_path, sep='\t')

In [4]:
# Load the Twitter credentials from the local JSON config file.
with open('twitter_config.json') as json_file:
    json_data = json.load(json_file)

# Unpack the four credentials into individual variables (same lookup order as the keys listed).
access_token, access_token_secret, consumer_key, consumer_secret = (
    json_data[key]
    for key in ('access_token', 'access_token_secret', 'consumer_key', 'consumer_secret')
)

In [5]:
# create the API object: build an OAuth handler from the consumer credentials,
# attach the access token pair, and hand the handler to tweepy.API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

In [7]:
# get API response and store it into a list:
# request every tweet id of the archive and keep the raw JSON of each status
tweet_ids = list(archive_df.tweet_id)
tweet_list = []
for tweet_id in tweet_ids:
    try:
        # NOTE(review): the wait_on_rate_limit* keywords are passed per call here —
        # confirm the installed tweepy version honours them on get_status.
        status = api.get_status(tweet_id, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
        tweet_list.append(status._json)
    except Exception as e:
        # best effort: report the error together with the failing id and carry on
        print(e, str(tweet_id))

2356

In [32]:
# compare the number of requested tweet ids with the number of tweets stored in tweet_list
len(tweet_ids), len(tweet_list)

(2356, 2342)

In [31]:
# Write every collected tweet to Data/tweet_json.txt, one JSON document per line.
with open('Data/tweet_json.txt', 'w') as file:
    file.writelines(json.dumps(tweet) + '\n' for tweet in tweet_list)

In [27]:
# what to do with missing ids?

# missing_ids = ['754011816964026368','770743923962707968','771004394259247104','775096608509886464','802247111496568832',
#               '827228250799742977','837012587749474308','842892208864923648','845459076796616705','861769973181624320',
#               '866816280283807744','869988702071779329','873697596434513921','888202515573088257']

# for tweet_id in missing_ids:
#     try:
#         status = api.get_status(tweet_id, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
#         tweet_list.append(status._json)
#     except Exception as e:
#         print(e, str(tweet_id))

In [46]:
# Read tweet_json.txt (one JSON document per line) and keep only the fields
# needed for the DataFrame: the tweet id and its retweet / favorite counts.
list_to_insert = []

with open('Data/tweet_json.txt', 'r') as infile:
    for line in infile:
        # Skip blank lines (e.g. a stray trailing newline) so json.loads does not raise on them.
        if not line.strip():
            continue
        tweet = json.loads(line)
        list_to_insert.append({'tweet_id': tweet['id'],
                               'retweets': tweet['retweet_count'],
                               'favorites': tweet['favorite_count']})

In [47]:
# check if the reading from file was successful: the number of records parsed
# from the file should equal the number of tweets held in tweet_list
len(list_to_insert) == len(tweet_list)

True

In [53]:
# Build the DataFrame from the parsed records, fixing the column order explicitly.
tweet_columns = ['tweet_id', 'retweets', 'favorites']
tweet_df = pd.DataFrame(list_to_insert, columns=tweet_columns)

In [89]:
# summarise tweet_df: number of entries, columns, non-null counts and dtypes
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2342 entries, 0 to 2341
Data columns (total 3 columns):
tweet_id     2342 non-null int64
retweets     2342 non-null int64
favorites    2342 non-null int64
dtypes: int64(3)
memory usage: 55.0 KB


## Assess

We have now gathered 3 dataframes:
- `archive_df` from the archive downloaded from Udacity
- `prediction_df` from url
- `tweet_df` from twitter api response content

In [114]:
# Increase column width for better readability: show full cell contents instead of truncating them.
# None means "no limit"; the former -1 sentinel is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [115]:
# Work on copies so the three gathered DataFrames stay untouched.
archive_df_clean, prediction_df_clean, tweet_df_clean = (
    frame.copy() for frame in (archive_df, prediction_df, tweet_df)
)

In [116]:
# Inner-join the archive copy with the API counts on tweet_id, then summarise the result.
archive_tweet_df = archive_df_clean.merge(tweet_df_clean, on='tweet_id', how='inner')
archive_tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2342 entries, 0 to 2341
Data columns (total 19 columns):
tweet_id                      2342 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2342 non-null object
source                        2342 non-null object
text                          2342 non-null object
retweeted_status_id           168 non-null float64
retweeted_status_user_id      168 non-null float64
retweeted_status_timestamp    168 non-null object
expanded_urls                 2283 non-null object
rating_numerator              2342 non-null int64
rating_denominator            2342 non-null int64
name                          2342 non-null object
doggo                         2342 non-null object
floofer                       2342 non-null object
pupper                        2342 non-null object
puppo                         2342 non-null object
retweets                      23

In [117]:
# Count the retweets: rows with a non-null retweeted_status_id are retweets.
is_retweet = archive_tweet_df['retweeted_status_id'].notnull()
archive_tweet_df[is_retweet].count()

tweet_id                      168
in_reply_to_status_id         0  
in_reply_to_user_id           0  
timestamp                     168
source                        168
text                          168
retweeted_status_id           168
retweeted_status_user_id      168
retweeted_status_timestamp    168
expanded_urls                 167
rating_numerator              168
rating_denominator            168
name                          168
doggo                         168
floofer                       168
pupper                        168
puppo                         168
retweets                      168
favorites                     168
dtype: int64

In [118]:
# Frequency of each extracted dog name, most common first.
archive_tweet_df['name'].value_counts()

None            738
a               55 
Charlie         11 
Oliver          11 
Lucy            11 
Cooper          11 
Tucker          10 
Penny           10 
Lola            10 
Winston         9  
Bo              9  
Sadie           8  
the             8  
Buddy           7  
an              7  
Daisy           7  
Bailey          7  
Toby            7  
Koda            6  
Oscar           6  
Jax             6  
Stanley         6  
Rusty           6  
Leo             6  
Bella           6  
Jack            6  
Scout           6  
Milo            6  
Dave            6  
Alfie           5  
               ..  
infuriating     1  
Link            1  
Bloo            1  
Karma           1  
Stephanus       1  
Brownie         1  
Maisey          1  
Dixie           1  
Bruiser         1  
Andru           1  
Stephan         1  
Kona            1  
Callie          1  
unacceptable    1  
Monty           1  
Champ           1  
Snoop           1  
Lenox           1  
Jeffri          1  


In [119]:
# List the extracted names in sorted order to inspect both ends of the range.
archive_tweet_df['name'].sort_values()

1007    Abby        
1021    Abby        
924     Ace         
1919    Acro        
1313    Adele       
1920    Aiden       
76      Aja         
471     Akumi       
807     Al          
862     Albert      
1940    Albert      
140     Albus       
403     Albus       
1101    Aldrick     
2032    Alejandro   
366     Alexander   
1175    Alexanderson
477     Alf         
1602    Alfie       
359     Alfie       
2224    Alfie       
845     Alfie       
651     Alfie       
50      Alfy        
1687    Alice       
196     Alice       
1481    Amber       
1320    Ambrose     
2064    Amy         
2181    Amélie      
         ...        
1081    mad         
839     my          
327     not         
974     not         
1733    officially  
1192    old         
361     one         
979     one         
1922    one         
911     one         
188     quite       
164     quite       
2312    quite       
2016    space       
21      such        
1801    the         
2198    the  

In [122]:
# check if the dog name is indeed not there or extraction failed
# (row label 839 serves as the example: its extracted name is the lowercase word 'my')
archive_tweet_df.loc[839]

tweet_id                      765395769549590528                                                                                                                    
in_reply_to_status_id         NaN                                                                                                                                   
in_reply_to_user_id           NaN                                                                                                                                   
timestamp                     2016-08-16 03:52:26 +0000                                                                                                             
source                        <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>                                                    
text                          This is my dog. Her name is Zoey. She knows I've been rating other dogs. She's not happy. 13/10 no bias at all https://t.co/ep1NkYoiwB
retweeted_

Dog names are capitalized in the tweets, so all-lowercase values in `name` are not real names.

In [123]:
# Select the rows whose extracted name is entirely lowercase.
lowercase_name_mask = archive_tweet_df['name'].str.islower()
archive_tweet_df[lowercase_name_mask]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweets,favorites
21,887517139158093824,,,2017-07-19 03:39:09 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",I've yet to rate a Venezuelan Hover Wiener. This is such an honor. 14/10 paw-inspiring af (IG: roxy.thedoxy) https://t.co/20VrLAA8ba,,,,https://twitter.com/dog_rates/status/887517139158093824/video/1,14,10,such,,,,,11622,45905
55,881536004380872706,,,2017-07-02 15:32:16 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here is a pupper approaching maximum borkdrive. Zooming at never before seen speeds. 14/10 paw-inspiring af \n(IG: puffie_the_chow) https://t.co/ghXBIIeQZF,,,,https://twitter.com/dog_rates/status/881536004380872706/video/1,14,10,a,,,pupper,,15906,49125
164,859196978902773760,,,2017-05-02 00:04:57 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10 https://t.co/g2nSyGenG9,,,,https://twitter.com/dog_rates/status/859196978902773760/video/1,12,10,quite,,,,,31353,91822
188,855459453768019968,,,2017-04-21 16:33:22 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","Guys, we only rate dogs. This is quite clearly a bulbasaur. Please only send dogs. Thank you... 12/10 human used pet, it's super effective https://t.co/Xc7uj1C64x",,,,"https://twitter.com/dog_rates/status/855459453768019968/photo/1,https://twitter.com/dog_rates/status/855459453768019968/photo/1",12,10,quite,,,,,8638,30786
327,832645525019123713,,,2017-02-17 17:38:57 +0000,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",There's going to be a dog terminal at JFK Airport. This is not a drill. 10/10 \nhttps://t.co/dp5h9bCwU7,,,,http://us.blastingnews.com/news/2017/02/jfk-announces-its-first-ever-ark-oasis-animal-terminal-001480161.html?sbdht=_pM1QUzk3wsdTxcmMoRPV7FWYYlsNKcFRcYSY7OmeHnOXA4NtUM6PLQ2_,10,10,not,,,,,568,3109
361,828650029636317184,,,2017-02-06 17:02:17 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","Occasionally, we're sent fantastic stories. This is one of them. 14/10 for Grace https://t.co/bZ4axuH6OK",,,,"https://twitter.com/dog_rates/status/828650029636317184/photo/1,https://twitter.com/dog_rates/status/828650029636317184/photo/1,https://twitter.com/dog_rates/status/828650029636317184/photo/1",14,10,one,,,,,1480,10167
533,806219024703037440,,,2016-12-06 19:29:28 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",We only rate dogs. Please stop sending in non-canines like this Freudian Poof Lion. This is incredibly frustrating... 11/10 https://t.co/IZidSrBvhi,,,,https://twitter.com/dog_rates/status/806219024703037440/photo/1,11,10,incredibly,,,,,1339,6944
639,792913359805018113,,,2016-10-31 02:17:31 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here is a perfect example of someone who has their priorities in order. 13/10 for both owner and Forrest https://t.co/LRyMrU7Wfq,,,,"https://twitter.com/dog_rates/status/792913359805018113/photo/1,https://twitter.com/dog_rates/status/792913359805018113/photo/1,https://twitter.com/dog_rates/status/792913359805018113/photo/1,https://twitter.com/dog_rates/status/792913359805018113/photo/1",13,10,a,,,,,4492,15588
672,788552643979468800,,,2016-10-19 01:29:35 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",RT @dog_rates: Say hello to mad pupper. You know what you did. 13/10 would pet until no longer furustrated https://t.co/u1ulQ5heLX,7.363926e+17,4.196984e+09,2016-05-28 03:04:00 +0000,"https://vine.co/v/iEggaEOiLO3,https://vine.co/v/iEggaEOiLO3",13,10,mad,,,pupper,,8059,0
749,778396591732486144,,,2016-09-21 00:53:04 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",RT @dog_rates: This is an East African Chalupa Seal. We only rate dogs. Please only send in dogs. Thank you... 10/10 https://t.co/iHe6liLwWR,7.030419e+17,4.196984e+09,2016-02-26 02:20:37 +0000,"https://twitter.com/dog_rates/status/703041949650034688/photo/1,https://twitter.com/dog_rates/status/703041949650034688/photo/1",10,10,an,,,,,13598,0


Visual inspection suggests that some tweets do contain a dog name that was not extracted because the tweet does not follow the 'This is ...' pattern. Many of these tweets give the name in the form 'named ...' or 'name is ...'.

In [127]:
# Show the records without images, i.e. rows where expanded_urls is missing.
no_url_mask = archive_tweet_df['expanded_urls'].isna()
archive_tweet_df[no_url_mask]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweets,favorites
29,886267009285017600,8.862664e+17,2281182000.0,2017-07-15 16:51:35 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@NonWhiteHat @MayhewMayhem omg hello tanner you are a scary good boy 12/10 would pet with extreme caution,,,,,12,10,,,,,,4,116
54,881633300179243008,8.81607e+17,47384430.0,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@roushfenway These are good dogs but 17/10 is an emotional impulse rating. More like 13/10s,,,,,17,10,,,,,,7,126
63,879674319642796034,8.795538e+17,3105441000.0,2017-06-27 12:14:36 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@RealKentMurphy 14/10 confirmed,,,,,14,10,,,,,,10,311
111,870726314365509632,8.707262e+17,16487760.0,2017-06-02 19:38:25 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@ComplicitOwl @ShopWeRateDogs &gt;10/10 is reserved for dogs,,,,,10,10,,,,,,3,120
144,863427515083354112,8.634256e+17,77596200.0,2017-05-13 16:15:35 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","@Jack_Septic_Eye I'd need a few more pics to polish a full analysis, but based on the good boy content above I'm leaning towards 12/10",,,,,12,10,,,,,,99,2292
174,857214891891077121,8.571567e+17,180671000.0,2017-04-26 12:48:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@Marc_IRL pixelated af 12/10,,,,,12,10,,,,,,19,236
180,856330835276025856,,,2017-04-24 02:15:55 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",RT @Jenna_Marbles: @dog_rates Thanks for rating my cermets 14/10 wow I'm so proud I watered them so much,8.563302e+17,66699013.0,2017-04-24 02:13:14 +0000,,14,10,,,,,,709,0
181,856288084350160898,8.56286e+17,279281000.0,2017-04-23 23:26:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@xianmcguire @Jenna_Marbles Kardashians wouldn't be famous if as a society we didn't place enormous value on what they do. The dogs are very deserving of their 14/10,,,,,14,10,,,,,,17,530
183,855862651834028034,8.558616e+17,194351800.0,2017-04-22 19:15:32 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research,,,,,420,10,,,,,,28,358
184,855860136149123072,8.558585e+17,13615720.0,2017-04-22 19:05:32 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10",,,,,666,10,,,,,,1085,5166


In [132]:
# Distribution of rating denominators; the usual value is 10.
archive_tweet_df['rating_denominator'].value_counts()

10     2320
50     3   
80     2   
11     2   
20     2   
2      1   
16     1   
40     1   
70     1   
15     1   
90     1   
110    1   
120    1   
130    1   
150    1   
170    1   
7      1   
0      1   
Name: rating_denominator, dtype: int64

In [134]:
# List the records whose rating_denominator is not a multiple of 10.
not_multiple_of_10 = archive_tweet_df['rating_denominator'] % 10 != 0
archive_tweet_df[not_multiple_of_10]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweets,favorites
334,832088576586297345,8.320875e+17,30582080.0,2017-02-16 04:45:50 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",@docmisterio account started on 11/15/15,,,,,11,15,,,,,,2,71
507,810984652412424192,,,2016-12-19 23:06:23 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx,,,,"https://www.gofundme.com/sams-smile,https://twitter.com/dog_rates/status/810984652412424192/photo/1",24,7,Sam,,,,,1593,5769
1054,740373189193256964,,,2016-06-08 02:41:38 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ",,,,"https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1",9,11,,,,,,14467,36874
1648,682962037429899265,,,2016-01-01 16:30:13 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5,,,,https://twitter.com/dog_rates/status/682962037429899265/photo/1,7,11,Darrel,,,,,18198,39311
1649,682808988178739200,6.827884e+17,4196984000.0,2016-01-01 06:22:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible",,,,,20,16,,,,,,197,1880
2321,666287406224695296,,,2015-11-16 16:11:11 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv,,,,https://twitter.com/dog_rates/status/666287406224695296/photo/1,1,2,an,,,,,64,149


In [136]:
# Distribution of rating numerators.
archive_tweet_df['rating_numerator'].value_counts()

12      553
11      464
10      461
13      345
9       157
8       102
7       55 
14      52 
5       37 
6       32 
3       19 
4       17 
1       9  
2       9  
420     2  
0       2  
15      2  
75      2  
80      1  
20      1  
24      1  
26      1  
44      1  
50      1  
60      1  
165     1  
84      1  
88      1  
144     1  
182     1  
143     1  
666     1  
960     1  
1776    1  
17      1  
27      1  
45      1  
99      1  
121     1  
204     1  
Name: rating_numerator, dtype: int64

In [105]:
# Show the tweets whose text contains the substring 'puppo', to compare with the puppo column.
puppo_in_text = archive_tweet_df['text'].str.contains('puppo')
archive_tweet_df[puppo_in_text]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweets,favorites
12,889665388333682689,,,2017-07-25 01:55:32 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here's a puppo that seems to be on the fence about something haha no but seriously someone help her. 13/10 https://t.co/BxvuXk0UCm,,,,https://twitter.com/dog_rates/status/889665388333682689/photo/1,13,10,,,,,puppo,10012,47778
14,889531135344209921,,,2017-07-24 17:02:04 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Stuart. He's sporting his favorite fanny pack. Secretly filled with bones only. 13/10 puppared puppo #BarkWeek https://t.co/y70o6h3isq,,,,https://twitter.com/dog_rates/status/889531135344209921/photo/1,13,10,Stuart,,,,puppo,2224,14987
84,876120275196170240,,,2017-06-17 16:52:05 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","Meet Venti, a seemingly caffeinated puppoccino. She was just informed the weekend would include walks, pats and scritches. 13/10 much excite https://t.co/ejExJFq3ek",,,,https://twitter.com/dog_rates/status/876120275196170240/photo/1,13,10,Venti,,,,,4688,27735
93,874012996292530176,,,2017-06-11 21:18:31 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Sebastian. He can't see all the colors of the rainbow, but he can see that this flag makes his human happy. 13/10 #PrideMonth puppo https://t.co/XBE0evJZ6V",,,,"https://twitter.com/dog_rates/status/874012996292530176/photo/1,https://twitter.com/dog_rates/status/874012996292530176/photo/1",13,10,Sebastian,,,,puppo,10482,34467
104,871879754684805121,,,2017-06-06 00:01:46 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Say hello to Lassie. She's celebrating #PrideMonth by being a splendid mix of astute and adorable. Proudly supupporting her owner. 13/10 https://t.co/uK6PNyeh9w,,,,"https://twitter.com/dog_rates/status/871879754684805121/photo/1,https://twitter.com/dog_rates/status/871879754684805121/photo/1",13,10,Lassie,,,,,11292,37972
126,867421006826221569,,,2017-05-24 16:44:18 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Shikha. She just watched you drop a skittle on the ground and still eat it. Could not be less impressed. 12/10 superior puppo https://t.co/XZlZKd73go,,,,https://twitter.com/dog_rates/status/867421006826221569/photo/1,12,10,Shikha,,,,puppo,2570,16241
130,866686824827068416,,,2017-05-22 16:06:55 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Lili. She can't believe you betrayed her with bath time. Never looking you in the eye again. 12/10 would puppologize profusely https://t.co/9b9J46E86Z,,,,"https://twitter.com/dog_rates/status/866686824827068416/photo/1,https://twitter.com/dog_rates/status/866686824827068416/photo/1",12,10,Lili,,,,,3558,19486
163,859607811541651456,,,2017-05-03 03:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Sorry for the lack of posts today. I came home from school and had to spend quality time with my puppo. Her name is Zoey and she's 13/10 https://t.co/BArWupFAn0,,,,https://twitter.com/dog_rates/status/859607811541651456/photo/1,13,10,,,,,puppo,1626,18946
167,858843525470990336,,,2017-05-01 00:40:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",I have stumbled puppon a doggo painting party. They're looking to be the next Pupcasso or Puppollock. All 13/10 would put it on the fridge https://t.co/cUeDMlHJbq,,,,https://twitter.com/dog_rates/status/858843525470990336/photo/1,13,10,,doggo,,,,3601,15847
186,855851453814013952,,,2017-04-22 18:31:02 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here's a puppo participating in the #ScienceMarch. Cleverly disguising her own doggo agenda. 13/10 would keep the planet habitable for https://t.co/cMhq16isel,,,,https://twitter.com/dog_rates/status/855851453814013952/photo/1,13,10,,doggo,,,puppo,18537,46609


In [80]:
# summarise tweet_df: number of entries, columns, non-null counts and dtypes
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2342 entries, 0 to 2341
Data columns (total 3 columns):
tweet_id     2342 non-null int64
retweets     2342 non-null int64
favorites    2342 non-null int64
dtypes: int64(3)
memory usage: 55.0 KB


In [68]:
# summarise prediction_df: number of entries, columns, non-null counts and dtypes
prediction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [85]:
# Check for duplicates in prediction_df: count the tweet_ids that repeat an earlier one.
# A result of 0 means every tweet_id is unique, so the outcome is visible either way
# (a loop that only prints on duplicates shows nothing when there are none).
prediction_df.tweet_id.duplicated().sum()

In [135]:
# Check for the float numerator: find tweets whose rating has a decimal numerator (e.g. 13.5/10).
# The pattern has no capture group because str.contains only tests for a match,
# and pandas warns when the pattern contains groups.
archive_tweet_df[archive_tweet_df.text.str.contains(r"\d+\.\d*/\d+")]

  


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweets,favorites
44,883482846933004288,,,2017-07-08 00:28:19 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Bella. She hopes her smile made you smile. If not, she is also offering you her favorite monkey. 13.5/10 https://t.co/qjrljjt948",,,,"https://twitter.com/dog_rates/status/883482846933004288/photo/1,https://twitter.com/dog_rates/status/883482846933004288/photo/1",5,10,Bella,,,,,9918,45617
332,832215909146226688,,,2017-02-16 13:11:49 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…",7.867091e+17,4196984000.0,2016-10-13 23:23:56 +0000,https://twitter.com/dog_rates/status/786709082849828864/photo/1,75,10,Logan,,,,,6790,0
685,786709082849828864,,,2016-10-13 23:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS",,,,https://twitter.com/dog_rates/status/786709082849828864/photo/1,75,10,Logan,,,,,6790,19714
753,778027034220126208,,,2016-09-20 00:24:34 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Sophie. She's a Jubilant Bush Pupper. Super h*ckin rare. Appears at random just to smile at the locals. 11.27/10 would smile back https://t.co/QFaUiIHxHq,,,,https://twitter.com/dog_rates/status/778027034220126208/photo/1,27,10,Sophie,,,pupper,,1805,7090
1675,681340665377193984,6.813394e+17,4196984000.0,2015-12-28 05:07:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",I've been told there's a slight possibility he's checking his mirror. We'll bump to 9.5/10. Still a menace,,,,,5,10,,,,,,302,1744
1698,680494726643068929,,,2015-12-25 21:06:00 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here we have uncovered an entire battalion of holiday puppers. Average of 11.26/10 https://t.co/eNm2S6p9BD,,,,https://twitter.com/dog_rates/status/680494726643068929/photo/1,26,10,,,,,,524,1816


### Quality Issues
- Incorrect dtype to columns in `archive_tweet_df`. e.g. timestamp -> datetime
- Records representing retweets
- Subsequently, columns related to retweeted_status_*
- Incorrect dog names 
- Missing dog names are stored as the string 'None' instead of a null value
- Records with no images (null `expanded_urls`)
- Float numerators in the `text` are not correctly represented in column `rating_numerator` (e.g. 13.5/10 is stored with numerator 5)
- denominator that are not 10 (or multiples of 10)
- Missing values in `doggo`, `floofer`, `pupper`, `puppo` are recorded as 'None' rather than as nulls

### Tidiness
- Consolidate doggo, floofer, pupper, puppo to one column `dog_stages`
- Potentially only keep `p1`, `p1_conf` and `p1_dog` columns in `prediction_df`
- Merge `archive_df` and `tweet_df` (already done!)

## Clean

#### Define
Assign correct datatypes to columns in `archive_tweet_df`
- timestamp -> datetime
- rating_numerator -> float

#### Code

In [140]:
# timestamp to datetime dtype; pd.to_datetime converts the whole column in one call
# instead of being applied value by value
archive_tweet_df['timestamp'] = pd.to_datetime(archive_tweet_df['timestamp'])

In [145]:
# Cast rating_numerator to a floating-point column.
archive_tweet_df['rating_numerator'] = archive_tweet_df['rating_numerator'].astype('float64')

#### Test

In [148]:
# list the column dtypes to confirm the conversions of timestamp and rating_numerator
archive_tweet_df.dtypes

tweet_id                      int64         
in_reply_to_status_id         float64       
in_reply_to_user_id           float64       
timestamp                     datetime64[ns]
source                        object        
text                          object        
retweeted_status_id           float64       
retweeted_status_user_id      float64       
retweeted_status_timestamp    object        
expanded_urls                 object        
rating_numerator              float64       
rating_denominator            int64         
name                          object        
doggo                         object        
floofer                       object        
pupper                        object        
puppo                         object        
retweets                      int64         
favorites                     int64         
dtype: object

#### Define

#### Code

#### Test

#### Define

#### Code

#### Test

#### Define

#### Code

#### Test

#### Define

#### Code

#### Test

#### Define

#### Code

#### Test

#### Define

#### Code

#### Test

#### Define

#### Code

#### Test

#### Define

#### Code

#### Test