# WeRateDogs Tweeter Archive

## Gather

In [1]:
import pandas as pd
import numpy as np
import requests
#import tweepy

In [2]:
tweet = pd.read_csv('twitter-archive-enhanced.csv')

In [None]:
###
r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')
if r.status_code == 200:
    with open('image_predictions.tsv', 'wb') as f:
        f.write(r.content)

In [3]:
image = pd.read_csv('image_predictions.tsv', sep = '\t')

In [None]:
###
consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True )

In [None]:
###
from timeit import default_timer as timer

start = timer()
tweet_id = []
retweet_count = []
favorite_count = []

for ids in image.tweet_id:
    try:
        tweet = api.get_status(ids, tweet_mode='extended')
        tweet_id.append(ids)
        retweet_count.append(tweet.retweet_count)
        favorite_count.append(tweet.favorite_count)
    except Exception as e:
        print(str(e))
        
end = timer()
print(end - start)    

In [None]:
###
file = open("tweet_json.txt", "w")
for index in range(len(tweet_id)):
    file.write(str(tweet_id[index]) + " " + str(retweet_count[index]) + " " + str(favorite_count[index]) + "\n")
file.close()

In [4]:
info = pd.read_csv('tweet_json.txt', sep=" ", header=None)
info.columns = ['tweet_id','retweet_count','favorite_count']

## Assess

In [None]:
tweet.head(10)

In [None]:
image.head(10)

In [None]:
info.head(10)

In [11]:
tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [None]:
image.info()

In [None]:
info.info()

In [None]:
all_columns = pd.Series(list(tweet) + list(image) + list(info))
all_columns[all_columns.duplicated()]

In [None]:
list(tweet)

In [None]:
list(image)

In [None]:
tweet[tweet['expanded_urls'].isnull()]

In [None]:
tweet.describe()

In [None]:
image.describe()

In [None]:
info.describe()

In [None]:
tweet.sample(5)

In [None]:
tweet.name.value_counts()

In [None]:
tweet.source.value_counts()

In [None]:
image.jpg_url.value_counts()

In [None]:
tweet[tweet.name.duplicated()]

In [None]:
tweet.rating_numerator.sort_values()

In [None]:
tweet.rating_denominator.sort_values()

In [None]:
len(tweet[(tweet['doggo'] == 'doggo')]) + len(tweet[(tweet['floofer'] == 'floofer')])  + len(tweet[(tweet['pupper'] == 'pupper')]) 
+ len(tweet[(tweet['puppo'] == 'puppo')]) 

#### Quality
##### `tweet` table
- in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, and retweeted_status_user_id are int not float
- Erroneous datatypes (retweeted_status_timestamp, timestamp, text, and source, tweet_id columns) 
- source has four categories
- name sometimes lower case, captial letter other times
- Missing expanded_urls (2297 instead of 2356)
- Missing in_reply_to_status_id and in_reply_to_user_id (78 instead of 2356)
- Missing retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp (181 instead of 2356)
- actually, by, his, my, one, the, very are not dog name
- Multiple records for 'a, an' as a name
- Some rating_numerator and rating_denominator are extremely high
- Some rows have rating_numerator and rating_denominator but do not have expanded_urls
- rating_numerator and rating_denumerator fields should be floats

##### `image` table
- Duplicates in jpg_url
- Lowercase p1, p2, p3
- Missing records (2075 instead of 2356)

##### `info` table
- Missing records (2069 instead of 2075)

#### Tidiness
- Merge doggo, pupper, puppo, and floofer columns into one column in `tweet` table
- `info` table and `image` table should be part of the `tweet` table

## Clean

In [39]:
tweet_clean = tweet.copy()
image_clean = image.copy()
info_clean = info.copy()

### Missing Data

#### `tweet`: Missing expanded_urls (2297 instead of 2356), Missing in_reply_to_status_id and in_reply_to_user_id (78 instead of 2356), Missing retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp (181 instead of 2356)

##### Define
- Remove tweets that don't have image url
- Remove replied tweets
- Remove retweets

##### Code

In [6]:
tweet_clean = tweet_clean[~tweet_clean.expanded_urls.isnull()]

##### Test

In [7]:
tweet_clean.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


##### Code

In [8]:
tweet_clean = tweet_clean[tweet_clean['in_reply_to_status_id'].isnull()]

##### Test

In [None]:
len(tweet_clean)

##### Code

In [9]:
tweet_clean = tweet_clean[tweet_clean['retweeted_status_id'].isnull()]

##### Test

In [None]:
len(tweet_clean)

##### Code

In [10]:
drop_columns = ['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 
                'retweeted_status_user_id', 'retweeted_status_timestamp']

tweet_clean = tweet_clean.drop(labels = drop_columns, axis = 1)

##### Test

In [None]:
tweet_clean.head()

### Quality

#### `tweet`: Erroneous datatypes (tweet_id, timestamp, text, and source columns) , Source has four categories

##### Define
- Change tweet_id to string data type
- Change timestamp to date data type
- Change text to string data type
- Change source to category data type
- Replace the four category source with short version

##### Code

In [40]:
tweet_clean.tweet_id = tweet_clean.tweet_id.astype(str) 
tweet_clean.text = tweet_clean.text.astype(str)
tweet_clean.source = tweet_clean.source.astype('category')
tweet_clean.timestamp = pd.to_datetime(tweet_clean.timestamp)

In [41]:
tweet_clean.source = tweet_clean.source.replace('<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'Twitter for iPhone')
tweet_clean.source = tweet_clean.source.replace('<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>', 'Vine Make a Scene')
tweet_clean.source = tweet_clean.source.replace('<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'Twitter Web Client')
tweet_clean.source = tweet_clean.source.replace('<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>', 'Tweet Deck')

##### Test

In [14]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2094 non-null object
timestamp             2094 non-null datetime64[ns]
source                2094 non-null object
text                  2094 non-null object
expanded_urls         2094 non-null object
rating_numerator      2094 non-null int64
rating_denominator    2094 non-null int64
name                  2094 non-null object
doggo                 2094 non-null object
floofer               2094 non-null object
pupper                2094 non-null object
puppo                 2094 non-null object
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 212.7+ KB


In [None]:
tweet_clean.head()

#### `tweet`: rating_numerator and rating_denumerator fields should be floats

##### Define
- Find all tweets with decimal numerators in their text
- Replace integers with decimal numerators

##### Code

In [42]:
tweet_clean.rating_numerator = tweet_clean.rating_numerator.astype(float) 
tweet_clean.rating_denominator = tweet_clean.rating_denominator.astype(float)

In [73]:
tweet_clean[tweet_clean.text.str.contains(r'(\d+(\.\d+))\/(\d+)')].text.str.extract(r'(\d+(\.\d+))\/(\d+)')

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


Unnamed: 0,0,1,2
45,13.5,0.5,10
340,9.75,0.75,10
695,9.75,0.75,10
763,11.27,0.27,10
1689,9.5,0.5,10
1712,11.26,0.26,10


In [None]:
tweet_clean.loc(45,'rating_numerator') =  '13.5'
tweet_clean.loc(340,'rating_numerator') =  '9.75'
tweet_clean.loc(695,'rating_numerator') =  '9.75'
tweet_clean.loc(763,'rating_numerator') =  '11.27'
tweet_clean.loc(1689,'rating_numerator') =  '9.5'
tweet_clean.loc(1712,'rating_numerator') =  '11.26'

##### Test

In [66]:
tweet_clean[tweet_clean.rating_numerator == 13.5]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


#### `tweet`: Some rating_numerator and rating_denominator are extremely high

##### Define
- Calculate the rating based on rating_numerator divide rating_denominator
- Remove the rating greater than 2.0

##### Code

In [None]:
tweet_clean['rating'] = tweet_clean.rating_numerator / tweet_clean.rating_denominator

In [None]:
for dog_rate in tweet_clean.rating:
    if dog_rate > 2.0:
        print(dog_rate)

In [None]:
tweet_clean = tweet_clean[tweet_clean.rating < 2]

In [None]:
tweet_clean = tweet_clean.drop(['rating_numerator', 'rating_denominator'], axis=1)

##### Test

In [None]:
tweet_clean.rating.sort_values()

#### `tweet`: Multiple records for 'a, an' as a name, 'actually, by, his, my, one, the, very' are not dog name

##### Define
- Filter the text column to check if 'a','an', etc have corresponding name
- If not, change the name to 'None'

##### Code

In [82]:
Exception_names = ['a', 'an','actually', 'by', 'his', 'my', 'one', 'the', 'very', 'such','getting', 'world', 'mad']

In [83]:
tweet_clean['new_text'] = tweet_clean[tweet_clean['name'].isin(Exception_names)].text

In [84]:
# tweet_clean['new_text'][~tweet_clean['new_text'].isnull()]
tweet_clean['new_text'] = tweet_clean['new_text'].str.extract('\is (?:a|an|the) (.*?)\.').str.split().str[-1]

  from ipykernel import kernelapp as app


In [85]:
tweet_clean['new_text'] = tweet_clean['new_text'].fillna(tweet_clean['name'])

In [87]:
# tweet_clean.loc(1527,'new_text') =  'lobster'
# tweet_clean.loc(1603,'new_text') =  'pupper'
# tweet_clean.loc(1797,'new_text') =  'pupper'
# tweet_clean.loc(1815,'new_text') =  'None'
# tweet_clean.loc(2037,'new_text') =  'None'
# tweet_clean.loc(2212,'new_text') =  'vine'
# tweet_clean.loc(1146,'new_text') =  'pupper'
# tweet_clean.loc(1457,'new_text') =  'pupper'
# tweet_clean.loc(2019,'new_text') =  'None'
# tweet_clean.loc(1138,'new_text') =  'pupper'
# tweet_clean.loc(1025,'new_text') =  'Kangaroo'
# tweet_clean.loc(1362,'new_text') =  'Seal'
# tweet_clean.loc(2204,'new_text') =  'Berta'
# tweet_clean.loc(1724,'new_text') =  'None'
# tweet_clean.loc(1071,'new_text') =  'Beaver'
# tweet_clean.loc(992,'new_text') =  'Quizno'
# tweet_clean.loc(1095,'new_text') =  'pupper'
# tweet_clean.loc(1897,'new_text') =  'Columbia'
# tweet_clean.loc(1923,'new_text') =  'Baklava'
# tweet_clean.loc(852,'new_text') =  'Zoey'
# tweet_clean.loc(1120,'new_text') =  'doggo'
# tweet_clean.loc(819,'new_text') =  'Kangaroo'
# tweet_clean.loc(1031,'new_text') =  'Seal'
# tweet_clean.loc(1097,'new_text') =  'Turtle'
# tweet_clean.loc(1385,'new_text') =  'snake'
# tweet_clean.loc(2333,'new_text') =  'Parthenon'
# tweet_clean.loc(2335,'new_text') =  'Episcopalian'
# tweet_clean.loc(2345,'new_text') =  'dog'
# tweet_clean.loc(2346,'new_text') =  'Paul'
# tweet_clean.loc(2047,'new_text') =  'terrier'
# tweet_clean.loc(2348,'new_text') =  'bear'
# tweet_clean.loc(2349,'new_text') =  'dog'

tweet_clean.new_text[1527] = tweet_clean.new_text[1527].replace('the', 'lobster')
tweet_clean.new_text[1603] = tweet_clean.new_text[1603].replace('the', 'pupper')
tweet_clean.new_text[1797] = tweet_clean.new_text[1797].replace('the', 'pupper')
tweet_clean.new_text[1815] = tweet_clean.new_text[1815].replace('the', 'None')
tweet_clean.new_text[2037] = tweet_clean.new_text[2037].replace('the', 'None')
tweet_clean.new_text[2212] = tweet_clean.new_text[2212].replace('the', 'vine')
tweet_clean.new_text[1146] = tweet_clean.new_text[1146].replace('just', 'pupper')
tweet_clean.new_text[1457] = tweet_clean.new_text[1457].replace('just', 'pupper')
tweet_clean.new_text[2019] = tweet_clean.new_text[2019].replace('just', 'None')
tweet_clean.new_text[1138] = tweet_clean.new_text[1138].replace('all', 'pupper')
tweet_clean.new_text[1025] = tweet_clean.new_text[1025].replace('an', 'Kangaroo')
tweet_clean.new_text[1362] = tweet_clean.new_text[1362].replace('an', 'Seal')
tweet_clean.new_text[2204] = tweet_clean.new_text[2204].replace('an', 'Berta')
tweet_clean.new_text[1724] = tweet_clean.new_text[1724].replace('by', 'None')
tweet_clean.new_text[1071] = tweet_clean.new_text[1071].replace('getting', 'Beaver')
tweet_clean.new_text[992] = tweet_clean.new_text[992].replace('his', 'Quizno')
tweet_clean.new_text[1095] = tweet_clean.new_text[1095].replace('mad', 'pupper')
tweet_clean.new_text[1897] = tweet_clean.new_text[1897].replace('mix', 'Columbia')
tweet_clean.new_text[1923] = tweet_clean.new_text[1923].replace('mix', 'Baklava')
tweet_clean.new_text[852] = tweet_clean.new_text[852].replace('my', 'Zoey')
tweet_clean.new_text[1120] = tweet_clean.new_text[1120].replace('this', 'doggo')
tweet_clean.new_text[819] = tweet_clean.new_text[819].replace('very', 'Kangaroo')
tweet_clean.new_text[1031] = tweet_clean.new_text[1031].replace('very', 'Seal')
tweet_clean.new_text[1097] = tweet_clean.new_text[1097].replace('very', 'Turtle')
tweet_clean.new_text[1385] = tweet_clean.new_text[1385].replace('very', 'snake')
tweet_clean.new_text[1382] = tweet_clean.new_text[1382].replace('world', 'pupper')
tweet_clean.new_text[2333] = tweet_clean.new_text[2333].replace('an', 'Parthenon')
tweet_clean.new_text[2335] = tweet_clean.new_text[2335].replace('an', 'Episcopalian')
tweet_clean.new_text[2345] = tweet_clean.new_text[2345].replace('the', 'dog')
tweet_clean.new_text[2346] = tweet_clean.new_text[2346].replace('the', 'Paul')
tweet_clean.new_text[2347] = tweet_clean.new_text[2347].replace('wheels', 'terrier')
tweet_clean.new_text[2348] = tweet_clean.new_text[2348].replace('mix', 'bear')
tweet_clean.new_text[2349] = tweet_clean.new_text[2349].replace('an', 'dog')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [88]:
tweet_clean.name = tweet_clean.new_text
tweet_clean = tweet_clean.drop(['new_text'], axis=1)

In [92]:
tweet_clean['name'].replace('None', np.nan, inplace=True)
tweet_clean['name'].replace('dog', np.nan, inplace=True)

##### Test

In [93]:
tweet_clean['name'].value_counts()

Charlie      12
Lucy         11
Oliver       11
Cooper       11
Penny        10
Lola         10
Tucker       10
Winston       9
Bo            9
Sadie         8
Toby          7
Bailey        7
Daisy         7
Buddy         7
Stanley       6
Milo          6
Rusty         6
Dave          6
Leo           6
Jack          6
Scout         6
Bella         6
Koda          6
Oscar         6
Jax           6
Chester       5
Alfie         5
Phil          5
George        5
Louis         5
             ..
Franq         1
Creg          1
Maisey        1
see           1
Edgar         1
Bronte        1
Zara          1
Meatball      1
Lili          1
Grizzie       1
Mollie        1
Teddy         1
Snoop         1
Lenox         1
Jeremy        1
Zooey         1
Todo          1
Kanu          1
Beaver        1
Ember         1
Steve         1
Laika         1
Socks         1
Eazy          1
Kane          1
cow           1
Finnegus      1
Alexander     1
Guinea        1
Opie          1
Name: name, Length: 1012

#### `image`: Duplicates in jpg_url

##### Define
- Find the duplicates in jpg_url
- Remove the duplicate rows

##### Code

In [None]:
image_clean = image_clean[~image_clean.duplicated(['jpg_url'], keep=False)]

##### Test

In [None]:
image_clean.info()

#### `tweet`: Name sometimes lower case, captial letter other times
#### `image`: Lowercase p1, p2, p3

##### Define
- Lower case name column in 'tweet' table
- Lower case p1, p2, p3 columns in 'image' table

##### Code

In [None]:
tweet_clean['name'] = tweet_clean['name'].str.lower()
image_clean['p1'] = image_clean['p1'].str.lower()
image_clean['p2'] = image_clean['p2'].str.lower()
image_clean['p3'] = image_clean['p3'].str.lower()

##### Test

In [None]:
tweet_clean.sample(5)

In [None]:
image_clean.sample(5)

### Tidiness

#### Merge doggo, pupper, puppo, and floofer columns into one column in `tweet` table

##### Define
- Concatenate the doggo, floofer, pupper, and puppo columns to a breed column.
- Drop the doggo, floofer, pupper, and puppo columns.

##### Code

In [None]:
def breed_concat(row):
    breeds = []
    breed_list = ['doggo', 'floofer', 'pupper', 'puppo']
    
    for breed in breed_list:
        if row[breed] != 'None':
            breeds.append(breed)

    return ' '.join(sorted(breeds))

tweet_clean['breed'] = tweet_clean[['doggo', 'floofer', 'pupper', 'puppo']].apply(breed_concat, axis=1)

In [None]:
tweet_clean = tweet_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1)

In [None]:
tweet_clean.breed = tweet_clean.breed.astype('category')

##### Test

In [None]:
tweet_clean.sample(50)

#### `info` table and `image` table should be part of the `tweet` table

##### Define
Merge the info table and image table to the treatments table, joining on tweet_id.

##### Code

In [None]:
image_clean = pd.merge(image_clean, info_clean,
                            on=['tweet_id'], how='left')

In [None]:
tweet_clean = pd.merge(tweet_clean, image_clean,
                            on=['tweet_id'], how='left')

##### Test

In [None]:
tweet_clean.info()

In [None]:
image_clean[image_clean['retweet_count'].isnull()]

In [None]:
tweet_clean.sample(5)

## Store

In [None]:
tweet_clean.to_csv('twitter_archive_master.csv', index = False)

## Analyse

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
archive = pd.read_csv('twitter_archive_master.csv')

#### Rating Distribution

In [None]:
plt.hist(x = archive.rating, bins = 30)
plt.xlim(0,1.4)
plt.xlabel('Rating');
plt.ylabel('Count');
plt.title('Rating Distribution');
plt.savefig('rating_distribution.png')
plt.show()

Most people rates the dog above 1.0. Nearly 500 people rates dog at 1.2.

#### Dog Breed

In [None]:
counts = archive["breed"].value_counts()
plt.plot(range(len(counts)), counts)
plt.xticks(range(len(counts)), counts.index)
plt.xlabel('Count');
plt.ylabel('Dog Breed');
plt.title('Dog Breed Distribution');
plt.savefig('breed_distribution.png')
plt.show()

Most of tweet dog breed is pupper, then doggo and puppo. Floofer has the smallest amount.

In [None]:
archive.boxplot(column='rating', by='breed')
plt.xlabel('Breed');
plt.ylabel('Rating');
plt.title('Rating by Breed Type');
plt.savefig('rating_breed.png')
plt.show()

Rating of puppo is generally higher than rest of breed type. Pupper has the rating lower than dogoo and floofer.

In [None]:
archive["timestamp"] = pd.to_datetime(archive["timestamp"])
archive.set_index('timestamp', inplace=True)

In [None]:
archive[['rating']].plot(style = '.', alpha = 0.6)
plt.title('Rating Over the Time')
plt.xlabel('Date')
plt.ylabel('Rating')
plt.savefig('rating_time.png')
plt.show()

From Dec 2015 to Oct 2016, We still can see people rate the dog below 1.0. However, people seems like they always rate dog above 1.0, and the rating score 1.2 is the most popular score. But has less rating score is 1.4. 

#### Retweet Count & Favorite Count

In [None]:
plt.scatter(x = archive.retweet_count, y = archive.favorite_count)
plt.xlim(0, 20000)
plt.ylim(0, 60000)
plt.xlabel('Retweet Count');
plt.ylabel('Favorite Count');
plt.title('Retweet Count vs. Favorite Count');
plt.savefig('retweet_favorite.png')
plt.show()

The retweet count and favorite count show linear relationship.

In [None]:
archive[['favorite_count', 'retweet_count']].plot(style = '.', alpha = 0.4)
plt.title('Favorites and Retweets with Time')
plt.xlabel('Date')
plt.ylabel('Count')
plt.savefig('favorite_retweet_time.png')
plt.show()

Number of fovorite counts increased after 2016. Some even has counts over 100,000. 
Retweet counts was very stable over the time. Only June 2016 it reached to the peak (nearly 80,000 counts).

#### Top 10 dogs

In [None]:
top10_dogs = archive.loc[archive.p1.isin(archive.p1.value_counts().head(10).index)]

In [None]:
top10_mean = top10_dogs.groupby('p1').mean()
top10_mean

In [None]:
sns.set_style('white')
favorite_top10_dogs = sns.barplot(x = 'favorite_count', y = 'p1', data = top10_dogs)
favorite_top10_dogs.set_ylabel('Top 10 Breed Types')
favorite_top10_dogs.set_xlabel('Average Favorite Count')
favorite_top10_dogs.set_title('Favorite Count vs. Dog Breeds')

fig = favorite_top10_dogs.get_figure()
fig.savefig('favorite_top10_dogs.png')

Samoyed gets the highest average favorite counts among top 10 popular dogs.
Pug gets the lowest average favorite counts among top 10 popular dogs.

In [None]:
sns.set_style('white')
retweet_top10_dogs = sns.barplot(x = 'retweet_count', y = 'p1', data = top10_dogs)
retweet_top10_dogs.set_ylabel('Top 10 Breed Types')
retweet_top10_dogs.set_xlabel('Average Retweet Count')
retweet_top10_dogs.set_title('Retweet Count vs. Dog Breeds')

fig = retweet_top10_dogs.get_figure()
fig.savefig('retweet_top10_dogs.png')

Samoyed gets the highest average retweet counts among top 10 popular dogs.
Pug gets the lowest average retweet counts among top 10 popular dogs.

In [None]:
sns.set_style('white')
rating_top10_dogs = sns.barplot(x = 'rating', y = 'p1', data = top10_dogs)
rating_top10_dogs.set_ylabel('Top 10 Breed Types')
rating_top10_dogs.set_xlabel('Average Rating')
rating_top10_dogs.set_title('Retweet Count vs. Dog Breeds')

fig = rating_top10_dogs.get_figure()
fig.savefig('rating_top10_dogs.png')

Average ratings of the top 10 dogs are all over 1.0. Among them, samoyed can get the highest average rating which is almost 1.2.